内核中有一些与 SMP 相关的宏,例如 CONFIG_SMP、CONFIG_X86_LOCAL_APIC、CONFIG_X86_IO_APIC、CONFIG_MULTIQUAD 和 CONFIG_VISWS。本文将忽略依赖 CONFIG_MULTIQUAD 或 CONFIG_VISWS 的代码,因为除非使用 IBM 高端多处理器服务器或 SGI Visual Workstation,大多数人并不关心这些代码。
BSP(Bootstrap Processor,引导处理器)执行 start_kernel() -> smp_init() -> smp_boot_cpus() -> do_boot_cpu() -> wakeup_secondary_via_INIT() 来触发各个 AP(Application Processor,应用处理器)。技术细节请参阅《多处理器规范》(MultiProcessor Specification)和 IA-32 手册第 3 卷(第 7 章“多处理器管理”和第 8 章“高级可编程中断控制器”)。
在调用 smp_init() 之前,start_kernel() 会先完成以下工作来建立 SMP 环境:
start_kernel() |-- setup_arch() | |-- parse_cmdline_early(); // SMP looks for "noht" and "acpismp=force" | | `-- /* "noht" disables HyperThreading (2 logical cpus per Xeon) */ | | if (!memcmp(from, "noht", 4)) { | | disable_x86_ht = 1; | | set_bit(X86_FEATURE_HT, disabled_x86_caps); | | } | | /* "acpismp=force" forces parsing and use of the ACPI SMP table */ | | else if (!memcmp(from, "acpismp=force", 13)) | | enable_acpi_smp_table = 1; | |-- setup_memory(); // reserve memory for MP configuration table | | |-- reserve_bootmem(PAGE_SIZE, PAGE_SIZE); | | `-- find_smp_config(); | | `-- find_intel_smp(); | | `-- smp_scan_config(); | | |-- set flag smp_found_config | | |-- set MP floating pointer mpf_found | | `-- reserve_bootmem(mpf_found, PAGE_SIZE); | |-- if (disable_x86_ht) { // if HyperThreading feature disabled | | clear_bit(X86_FEATURE_HT, &boot_cpu_data.x86_capability[0]); | | set_bit(X86_FEATURE_HT, disabled_x86_caps); | | enable_acpi_smp_table = 0; | | } | |-- if (test_bit(X86_FEATURE_HT, &boot_cpu_data.x86_capability[0])) | | enable_acpi_smp_table = 1; | |-- smp_alloc_memory(); | | `-- /* reserve AP processor's real-mode code space in low memory */ | | trampoline_base = (void *) alloc_bootmem_low_pages(PAGE_SIZE); | `-- get_smp_config(); /* get boot-time MP configuration */ | |-- config_acpi_tables(); | | |-- memset(&acpi_boot_ops, 0, sizeof(acpi_boot_ops)); | | |-- acpi_boot_ops[ACPI_APIC] = acpi_parse_madt; | | `-- /* Set have_acpi_tables to indicate using | | * MADT in the ACPI tables; Use MPS tables if failed. */ | | if (enable_acpi_smp_table && !acpi_tables_init()) | | have_acpi_tables = 1; | |-- set pic_mode | | /* =1, if the IMCR is present and PIC Mode is implemented; | | * =0, otherwise Virtual Wire Mode is implemented. */ | |-- save local APIC address in mp_lapic_addr | `-- scan for MP configuration table entries, like | MP_PROCESSOR, MP_BUS, MP_IOAPIC, MP_INTSRC and MP_LINTSRC. 
|-- trap_init(); | `-- init_apic_mappings(); // setup PTE for APIC | |-- /* If no local APIC can be found then set up a fake all | | * zeroes page to simulate the local APIC and another | | * one for the IO-APIC. */ | | if (!smp_found_config && detect_init_APIC()) { | | apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); | | apic_phys = __pa(apic_phys); | | } else | | apic_phys = mp_lapic_addr; | |-- /* map local APIC address, | | * mp_lapic_addr (0xfee00000) in most case, | | * to linear address FIXADDR_TOP (0xffffe000) */ | | set_fixmap_nocache(FIX_APIC_BASE, apic_phys); | |-- /* Fetch the APIC ID of the BSP in case we have a | | * default configuration (or the MP table is broken). */ | | if (boot_cpu_physical_apicid == -1U) | | boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID)); | `-- // map IOAPIC address to uncacheable linear address | set_fixmap_nocache(idx, ioapic_phys); | // Now we can use linear address to access APIC space. |-- init_IRQ(); | |-- init_ISA_irqs(); | | |-- /* An initial setup of the virtual wire mode. */ | | | init_bsp_APIC(); | | `-- init_8259A(auto_eoi=0); | `-- setup SMP/APIC interrupt handlers, esp. IPI. `-- mem_init(); `-- /* delay zapping low mapping entries for SMP: zap_low_mappings() */ |
IPI(InterProcessor Interrupt,处理器间中断)是一种经由本地 APIC 传递的 CPU 到 CPU 中断,BSP 正是用这一机制来触发 AP。
请注意,在符合 MP 规范的系统中,“每个 CPU 都必须有一个本地 APIC”。各处理器不共享本地 APIC 单元的地址空间(物理地址 0xFEE00000 - 0xFEEFFFFF),但共享 I/O APIC 单元的地址空间(0xFEC00000 - 0xFECFFFFF)。这两个地址空间都是不可缓存的。
BSP 调用 start_kernel() -> smp_init() -> smp_boot_cpus(),为每个 CPU 建立数据结构,并激活其余的处理器(即各个 AP)。
/////////////////////////////////////////////////////////////////////////////// static void __init smp_init(void) { /* Get other processors into their bootup holding patterns. */ smp_boot_cpus(); wait_init_idle = cpu_online_map; clear_bit(current->processor, &wait_init_idle); /* Don't wait on me! */ smp_threads_ready=1; smp_commence() { /* Lets the callins below out of their loop. */ Dprintk("Setting commenced=1, go go go\n"); wmb(); atomic_set(&smp_commenced,1); } /* Wait for the other cpus to set up their idle processes */ printk("Waiting on wait_init_idle (map = 0x%lx)\n", wait_init_idle); while (wait_init_idle) { cpu_relax(); // i.e. "rep;nop" barrier(); } printk("All processors have done init_idle\n"); } /////////////////////////////////////////////////////////////////////////////// void __init smp_boot_cpus(void) { // ... something not very interesting :-) /* Initialize the logical to physical CPU number mapping * and the per-CPU profiling router/multiplier */ prof_counter[0..NR_CPUS-1] = 0; prof_old_multiplier[0..NR_CPUS-1] = 0; prof_multiplier[0..NR_CPUS-1] = 0; init_cpu_to_apicid() { physical_apicid_2_cpu[0..MAX_APICID-1] = -1; logical_apicid_2_cpu[0..MAX_APICID-1] = -1; cpu_2_physical_apicid[0..NR_CPUS-1] = 0; cpu_2_logical_apicid[0..NR_CPUS-1] = 0; } /* Setup boot CPU information */ smp_store_cpu_info(0); /* Final full version of the data */ printk("CPU%d: ", 0); print_cpu_info(&cpu_data[0]); /* We have the boot CPU online for sure. */ set_bit(0, &cpu_online_map); boot_cpu_logical_apicid = logical_smp_processor_id() { GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR)); } map_cpu_to_boot_apicid(0, boot_cpu_apicid) { physical_apicid_2_cpu[boot_cpu_apicid] = 0; cpu_2_physical_apicid[0] = boot_cpu_apicid; } global_irq_holder = 0; current->processor = 0; init_idle(); // will clear corresponding bit in wait_init_idle smp_tune_scheduling(); // ... 
some conditions checked connect_bsp_APIC(); // enable APIC mode if used to be PIC mode setup_local_APIC(); if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_physical_apicid) BUG(); /* Scan the CPU present map and fire up the other CPUs * via do_boot_cpu() */ Dprintk("CPU present map: %lx\n", phys_cpu_present_map); for (bit = 0; bit < NR_CPUS; bit++) { apicid = cpu_present_to_apicid(bit); /* Don't even attempt to start the boot CPU! */ if (apicid == boot_cpu_apicid) continue; if (!(phys_cpu_present_map & (1 << bit))) continue; if ((max_cpus >= 0) && (max_cpus <= cpucount+1)) continue; do_boot_cpu(apicid); /* Make sure we unmap all failed CPUs */ if ((boot_apicid_to_cpu(apicid) == -1) && (phys_cpu_present_map & (1 << bit))) printk("CPU #%d not responding - cannot use it.\n", apicid); } // ... SMP BogoMIPS // ... B stepping processor warning // ... HyperThreading handling /* Set up all local APIC timers in the system */ setup_APIC_clocks(); /* Synchronize the TSC with the AP */ if (cpu_has_tsc && cpucount) synchronize_tsc_bp(); smp_done: zap_low_mappings(); } /////////////////////////////////////////////////////////////////////////////// static void __init do_boot_cpu (int apicid) { cpu = ++cpucount; // 1. prepare "idle process" task struct for next AP /* We can't use kernel_thread since we must avoid to * reschedule the child. */ if (fork_by_hand() < 0) panic("failed fork for CPU %d", cpu); /* We remove it from the pidhash and the runqueue * once we got the process: */ idle = init_task.prev_task; if (!idle) panic("No idle process for CPU %d", cpu); /* we schedule the first task manually */ idle->processor = cpu; idle->cpus_runnable = 1 << cpu; // only on this AP! map_cpu_to_boot_apicid(cpu, apicid) { physical_apicid_2_cpu[apicid] = cpu; cpu_2_physical_apicid[cpu] = apicid; } idle->thread.eip = (unsigned long) start_secondary; del_from_runqueue(idle); unhash_process(idle); init_tasks[cpu] = idle; // 2. 
prepare stack and code (CS:IP) for next AP /* start_eip had better be page-aligned! */ start_eip = setup_trampoline() { memcpy(trampoline_base, trampoline_data, trampoline_end - trampoline_data); /* trampoline_base was reserved in * start_kernel() -> setup_arch() -> smp_alloc_memory(), * and will be shared by all APs (one by one) */ return virt_to_phys(trampoline_base); } /* So we see what's up */ printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip); stack_start.esp = (void *) (1024 + PAGE_SIZE + (char *)idle); /* this value is used by next AP when it executes * "lss stack_start,%esp" in * linux/arch/i386/kernel/head.S:startup_32(). */ /* This grunge runs the startup process for * the targeted processor. */ atomic_set(&init_deasserted, 0); Dprintk("Setting warm reset code and vector.\n"); CMOS_WRITE(0xa, 0xf); local_flush_tlb(); Dprintk("1.\n"); *((volatile unsigned short *) TRAMPOLINE_HIGH) = start_eip >> 4; Dprintk("2.\n"); *((volatile unsigned short *) TRAMPOLINE_LOW) = start_eip & 0xf; Dprintk("3.\n"); // we have setup 0:467 to start_eip (trampoline_base) // 3. kick AP to run (AP gets CS:IP from 0:467) // Starting actual IPI sequence... boot_error = wakeup_secondary_via_INIT(apicid, start_eip); if (!boot_error) { // looks OK /* allow APs to start initializing. */ set_bit(cpu, &cpu_callout_map); /* ... Wait 5s total for a response */ // bit cpu in cpu_callin_map is set by AP in smp_callin() if (test_bit(cpu, &cpu_callin_map)) { print_cpu_info(&cpu_data[cpu]); } else { boot_error= 1; // marker 0xA5 set by AP in trampoline_data() if (*((volatile unsigned char *)phys_to_virt(8192)) == 0xA5) /* trampoline started but... */ printk("Stuck ??\n"); else /* trampoline code not run */ printk("Not responding.\n"); } } if (boot_error) { /* Try to put things back the way they were before ... 
*/ unmap_cpu_to_boot_apicid(cpu, apicid); clear_bit(cpu, &cpu_callout_map); /* set in do_boot_cpu() */ clear_bit(cpu, &cpu_initialized); /* set in cpu_init() */ clear_bit(cpu, &cpu_online_map); /* set in smp_callin() */ cpucount--; } /* mark "stuck" area as not stuck */ *((volatile unsigned long *)phys_to_virt(8192)) = 0; } |
此文件(linux/arch/i386/kernel/trampoline.S)包含 AP 的 16 位实模式启动代码。BSP 在 start_kernel() -> setup_arch() -> smp_alloc_memory() 中预留了内存空间 trampoline_base。在触发 AP 之前,BSP 会把位于 trampoline_data 与 trampoline_end 之间的 trampoline 代码复制到 trampoline_base(在 do_boot_cpu() -> setup_trampoline() 中完成)。BSP 还把热复位向量 0:467 设置为指向 trampoline_base,使 AP 从这里开始执行。
/////////////////////////////////////////////////////////////////////////////// trampoline_data() { r_base: wbinvd; // Needed for NUMA-Q should be harmless for other DS = CS; BX = 1; // Flag an SMP trampoline cli; // write marker for master knows we're running trampoline_base = 0xA5A5A5A5; lidt idt_48; lgdt gdt_48; AX = 1; lmsw AX; // protected mode! goto flush_instr; flush_instr: goto CS:100000; // see linux/arch/i386/kernel/head.S:startup_32() } idt_48: .word 0 # idt limit = 0 .word 0, 0 # idt base = 0L gdt_48: .word 0x0800 # gdt limit = 2048, 256 GDT entries .long gdt_table-__PAGE_OFFSET # gdt base = gdt (first SMP CPU) .globl SYMBOL_NAME(trampoline_end) SYMBOL_NAME_LABEL(trampoline_end) |
与 BSP 不同,AP 在 linux/arch/i386/kernel/head.S:startup_32()(见第 6.4 节)的末尾调用的是 initialize_secondary(),而不是 start_kernel()。
/* Everything has been set up for the secondary * CPUs - they just need to reload everything * from the task structure * This function must not return. */ void __init initialize_secondary(void) { /* We don't actually need to load the full TSS, * basically just the stack pointer and the eip. */ asm volatile( "movl %0,%%esp\n\t" "jmp *%1" : :"r" (current->thread.esp),"r" (current->thread.eip)); } |
所有 AP 都等待来自 BSP 的信号 smp_commenced,该信号由第 8.2 节中的 smp_init() -> smp_commence() 置位。收到此信号后,各 AP 将开始运行“空闲”(idle)进程。
/////////////////////////////////////////////////////////////////////////////// int __init start_secondary(void *unused) { /* Dont put anything before smp_callin(), SMP * booting is too fragile that we want to limit the * things done here to the most necessary things. */ cpu_init(); smp_callin(); while (!atomic_read(&smp_commenced)) rep_nop(); /* low-memory mappings have been cleared, flush them from * the local TLBs too. */ local_flush_tlb(); return cpu_idle(); // never return, see Section 7.3 } |