8. SMP 启动

有一些与 SMP 相关的宏,例如 CONFIG_SMP, CONFIG_X86_LOCAL_APIC, CONFIG_X86_IO_APIC, CONFIG_MULTIQUADCONFIG_VISWS。我将忽略需要 CONFIG_MULTIQUADCONFIG_VISWS 的代码,因为大多数人并不关心这些(如果不是使用 IBM 高端多处理器服务器或 SGI Visual Workstation 的话)。

BSP 执行 start_kernel() -> smp_init() -> smp_boot_cpus() -> do_boot_cpu() -> wakeup_secondary_via_INIT() 来触发 AP。查看 多处理器规范 和 IA-32 手册卷 3(第 7 章 多处理器管理,以及第 8 章 高级可编程中断控制器)以获取技术细节。

8.1. 在 smp_init() 之前

在调用 smp_init() 之前,start_kernel() 做了一些事情来设置 SMP 环境
start_kernel()
|-- setup_arch()
|   |-- parse_cmdline_early();  // SMP looks for "noht" and "acpismp=force"
|   |   `-- /* "noht" disables HyperThreading (2 logical cpus per Xeon) */
|   |       if (!memcmp(from, "noht", 4)) {
|   |           disable_x86_ht = 1;
|   |           set_bit(X86_FEATURE_HT, disabled_x86_caps);
|   |       }
|   |       /* "acpismp=force" forces parsing and use of the ACPI SMP table */
|   |       else if (!memcmp(from, "acpismp=force", 13))
|   |           enable_acpi_smp_table = 1;
|   |-- setup_memory();         // reserve memory for MP configuration table
|   |   |-- reserve_bootmem(PAGE_SIZE, PAGE_SIZE);
|   |   `-- find_smp_config();
|   |       `-- find_intel_smp();
|   |           `-- smp_scan_config();
|   |               |-- set flag smp_found_config
|   |               |-- set MP floating pointer mpf_found
|   |               `-- reserve_bootmem(mpf_found, PAGE_SIZE);
|   |-- if (disable_x86_ht) {   // if HyperThreading feature disabled
|   |       clear_bit(X86_FEATURE_HT, &boot_cpu_data.x86_capability[0]);
|   |       set_bit(X86_FEATURE_HT, disabled_x86_caps);
|   |       enable_acpi_smp_table = 0;
|   |   }
|   |-- if (test_bit(X86_FEATURE_HT, &boot_cpu_data.x86_capability[0]))
|   |       enable_acpi_smp_table = 1;
|   |-- smp_alloc_memory();
|   |   `-- /* reserve AP processor's real-mode code space in low memory */
|   |       trampoline_base = (void *) alloc_bootmem_low_pages(PAGE_SIZE);
|   `-- get_smp_config();     /* get boot-time MP configuration */
|       |-- config_acpi_tables();
|       |   |-- memset(&acpi_boot_ops, 0, sizeof(acpi_boot_ops));
|       |   |-- acpi_boot_ops[ACPI_APIC] = acpi_parse_madt;
|       |   `-- /* Set have_acpi_tables to indicate using
|       |        * MADT in the ACPI tables; Use MPS tables if failed. */
|       |       if (enable_acpi_smp_table && !acpi_tables_init())
|       |           have_acpi_tables = 1;
|       |-- set pic_mode
|       |   /* =1, if the IMCR is present and PIC Mode is implemented;
|       |    * =0, otherwise Virtual Wire Mode is implemented. */
|       |-- save local APIC address in mp_lapic_addr
|       `-- scan for MP configuration table entries, like
|             MP_PROCESSOR, MP_BUS, MP_IOAPIC, MP_INTSRC and MP_LINTSRC.
|-- trap_init();
|   `-- init_apic_mappings();   // setup PTE for APIC
|       |-- /* If no local APIC can be found then set up a fake all
|       |    * zeroes page to simulate the local APIC and another
|       |    * one for the IO-APIC. */
|       |   if (!smp_found_config && detect_init_APIC()) {
|       |       apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
|       |       apic_phys = __pa(apic_phys);
|       |   } else
|       |       apic_phys = mp_lapic_addr;
|       |-- /* map local APIC address,
|       |    *   mp_lapic_addr (0xfee00000) in most case,
|       |    *   to linear address FIXADDR_TOP (0xffffe000) */
|       |   set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
|       |-- /* Fetch the APIC ID of the BSP in case we have a
|       |    * default configuration (or the MP table is broken). */
|       |   if (boot_cpu_physical_apicid == -1U)
|       |       boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
|       `-- // map IOAPIC address to uncacheable linear address
|           set_fixmap_nocache(idx, ioapic_phys);
|       // Now we can use linear address to access APIC space.
|-- init_IRQ();
|   |-- init_ISA_irqs();
|   |   |-- /* An initial setup of the virtual wire mode. */
|   |   |   init_bsp_APIC();
|   |   `-- init_8259A(auto_eoi=0);
|   `-- setup SMP/APIC interrupt handlers, esp. IPI.
`-- mem_init();
    `-- /* delay zapping low mapping entries for SMP: zap_low_mappings() */

IPI (处理器间中断),通过本地 APIC 的 CPU 到 CPU 中断,是 BSP 用来触发 AP 的机制。

请注意,在一个符合 MP 标准的系统中,“每个 CPU 都需要一个本地 APIC”。处理器不共享 APIC 本地单元地址空间(物理地址 0xFEE00000 - 0xFEEFFFFF),但将共享 APIC I/O 单元(0xFEC00000 - 0xFECFFFFF)。这两个地址空间都是不可缓存的。

8.2. smp_init()

BSP 调用 start_kernel() -> smp_init() -> smp_boot_cpus() 来为每个 CPU 设置数据结构并激活其余的 AP。
///////////////////////////////////////////////////////////////////////////////
static void __init smp_init(void)
{
        /* Get other processors into their bootup holding patterns. */
        smp_boot_cpus();
        wait_init_idle = cpu_online_map;
        clear_bit(current->processor, &wait_init_idle); /* Don't wait on me! */

        smp_threads_ready=1;
        smp_commence() {
                /* Lets the callins below out of their loop. */
                Dprintk("Setting commenced=1, go go go\n");
                wmb();
                atomic_set(&smp_commenced,1);
        }

        /* Wait for the other cpus to set up their idle processes */
        printk("Waiting on wait_init_idle (map = 0x%lx)\n", wait_init_idle);
        while (wait_init_idle) {
                cpu_relax();    // i.e. "rep;nop"
                barrier();
        }
        printk("All processors have done init_idle\n");
}

///////////////////////////////////////////////////////////////////////////////
void __init smp_boot_cpus(void)
{
        // ... something not very interesting :-)

        /* Initialize the logical to physical CPU number mapping
         * and the per-CPU profiling router/multiplier */
        prof_counter[0..NR_CPUS-1] = 0;
        prof_old_multiplier[0..NR_CPUS-1] = 0;
        prof_multiplier[0..NR_CPUS-1] = 0;

        init_cpu_to_apicid() {
                physical_apicid_2_cpu[0..MAX_APICID-1] = -1;
                logical_apicid_2_cpu[0..MAX_APICID-1] = -1;
                cpu_2_physical_apicid[0..NR_CPUS-1] = 0;
                cpu_2_logical_apicid[0..NR_CPUS-1] = 0;
        }

        /* Setup boot CPU information */
        smp_store_cpu_info(0); /* Final full version of the data */
        printk("CPU%d: ", 0);
        print_cpu_info(&cpu_data[0]);

        /* We have the boot CPU online for sure. */
        set_bit(0, &cpu_online_map);
        boot_cpu_logical_apicid = logical_smp_processor_id() {
                GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
        }
        map_cpu_to_boot_apicid(0, boot_cpu_apicid) {
               physical_apicid_2_cpu[boot_cpu_apicid] = 0;
               cpu_2_physical_apicid[0] = boot_cpu_apicid;
        }

        global_irq_holder = 0;
        current->processor = 0;
        init_idle();    // will clear corresponding bit in wait_init_idle
        smp_tune_scheduling();

        // ... some conditions checked

        connect_bsp_APIC();     // enable APIC mode if used to be PIC mode
        setup_local_APIC();

        if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_physical_apicid)
                BUG();

        /* Scan the CPU present map and fire up the other CPUs
         *   via do_boot_cpu() */
        Dprintk("CPU present map: %lx\n", phys_cpu_present_map);
        for (bit = 0; bit < NR_CPUS; bit++) {
                apicid = cpu_present_to_apicid(bit);
                /* Don't even attempt to start the boot CPU! */
                if (apicid == boot_cpu_apicid)
                        continue;
                if (!(phys_cpu_present_map & (1 << bit)))
                        continue;
                if ((max_cpus >= 0) && (max_cpus <= cpucount+1))
                        continue;
                do_boot_cpu(apicid);
                /* Make sure we unmap all failed CPUs */
                if ((boot_apicid_to_cpu(apicid) == -1) &&
                                (phys_cpu_present_map & (1 << bit)))
                        printk("CPU #%d not responding - cannot use it.\n",
                                                                apicid);
        }

        // ... SMP BogoMIPS
        // ... B stepping processor warning
        // ... HyperThreading handling

        /* Set up all local APIC timers in the system */
        setup_APIC_clocks();

        /* Synchronize the TSC with the AP */
        if (cpu_has_tsc && cpucount)
                synchronize_tsc_bp();

smp_done:
        zap_low_mappings();
}

///////////////////////////////////////////////////////////////////////////////
static void __init do_boot_cpu (int apicid)
{
        cpu = ++cpucount;

        // 1. prepare "idle process" task struct for next AP

        /* We can't use kernel_thread since we must avoid to
         * reschedule the child. */
        if (fork_by_hand() < 0)
                panic("failed fork for CPU %d", cpu);
        /* We remove it from the pidhash and the runqueue
         * once we got the process: */
        idle = init_task.prev_task;
        if (!idle)
                panic("No idle process for CPU %d", cpu);

        /* we schedule the first task manually */
        idle->processor = cpu;
        idle->cpus_runnable = 1 << cpu; // only on this AP!

        map_cpu_to_boot_apicid(cpu, apicid) {
                physical_apicid_2_cpu[apicid] = cpu;
                cpu_2_physical_apicid[cpu] = apicid;
        }

        idle->thread.eip = (unsigned long) start_secondary;

        del_from_runqueue(idle);
        unhash_process(idle);
        init_tasks[cpu] = idle;

        // 2. prepare stack and code (CS:IP) for next AP

        /* start_eip had better be page-aligned! */
        start_eip = setup_trampoline() {
                memcpy(trampoline_base, trampoline_data,
                        trampoline_end - trampoline_data);
                /* trampoline_base was reserved in
                 * start_kernel() -> setup_arch() -> smp_alloc_memory(),
                 * and will be shared by all APs (one by one) */
                return virt_to_phys(trampoline_base);
        }

        /* So we see what's up */
        printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip);
        stack_start.esp = (void *) (1024 + PAGE_SIZE + (char *)idle);
        /* this value is used by next AP when it executes
         *   "lss stack_start,%esp" in
         *   linux/arch/i386/kernel/head.S:startup_32(). */

        /* This grunge runs the startup process for
         * the targeted processor. */
        atomic_set(&init_deasserted, 0);
        Dprintk("Setting warm reset code and vector.\n");

        CMOS_WRITE(0xa, 0xf);
        local_flush_tlb();
        Dprintk("1.\n");
        *((volatile unsigned short *) TRAMPOLINE_HIGH) = start_eip >> 4;
        Dprintk("2.\n");
        *((volatile unsigned short *) TRAMPOLINE_LOW) = start_eip & 0xf;
        Dprintk("3.\n");
        // we have setup 0:467 to start_eip (trampoline_base)

        // 3. kick AP to run (AP gets CS:IP from 0:467)

        // Starting actual IPI sequence...
        boot_error = wakeup_secondary_via_INIT(apicid, start_eip);
        if (!boot_error) {      // looks OK
                /* allow APs to start initializing. */
                set_bit(cpu, &cpu_callout_map);

                /* ... Wait 5s total for a response */

                // bit cpu in cpu_callin_map is set by AP in smp_callin()
                if (test_bit(cpu, &cpu_callin_map)) {
                        print_cpu_info(&cpu_data[cpu]);
                } else {
                        boot_error= 1;
                        // marker 0xA5 set by AP in trampoline_data()
                        if (*((volatile unsigned char *)phys_to_virt(8192))
                                        == 0xA5)
                                /* trampoline started but... */
                                printk("Stuck ??\n");
                        else
                                /* trampoline code not run */
                                printk("Not responding.\n");
                }
        }
        if (boot_error) {
                /* Try to put things back the way they were before ... */
                unmap_cpu_to_boot_apicid(cpu, apicid);
                clear_bit(cpu, &cpu_callout_map); /* set in do_boot_cpu() */
                clear_bit(cpu, &cpu_initialized); /* set in cpu_init() */
                clear_bit(cpu, &cpu_online_map);  /* set in smp_callin() */
                cpucount--;
        }

        /* mark "stuck" area as not stuck */
        *((volatile unsigned long *)phys_to_virt(8192)) = 0;
}
不要将 start_secondary()trampoline_data() 混淆。前者是 AP “空闲”进程任务结构 EIP 值,后者是 AP 在 BSP 启动后(使用 wakeup_secondary_via_INIT())运行的实模式代码。

8.3. linux/arch/i386/kernel/trampoline.S

此文件包含 16 位实模式 AP 启动代码。BSP 在 start_kernel() -> setup_arch() -> smp_alloc_memory() 中保留了内存空间 trampoline_base。在 BSP 触发 AP 之前,它将 trampoline 代码,在 trampoline_datatrampoline_end 之间,复制到 trampoline_base(在 do_boot_cpu() -> setup_trampoline() 中)。BSP 设置 0:467 指向 trampoline_base,以便 AP 将从这里运行。

///////////////////////////////////////////////////////////////////////////////
trampoline_data()
{
r_base:
        wbinvd;         // Needed for NUMA-Q should be harmless for other
        DS = CS;
        BX = 1;         // Flag an SMP trampoline
        cli;

        // write marker for master knows we're running
        trampoline_base = 0xA5A5A5A5;

        lidt idt_48;
        lgdt gdt_48;

        AX = 1;
        lmsw AX;        // protected mode!
        goto flush_instr;
flush_instr:
        goto CS:100000; // see linux/arch/i386/kernel/head.S:startup_32()
}

idt_48:
        .word   0                       # idt limit = 0
        .word   0, 0                    # idt base = 0L

gdt_48:
        .word   0x0800                  # gdt limit = 2048, 256 GDT entries
        .long   gdt_table-__PAGE_OFFSET # gdt base = gdt (first SMP CPU)

.globl SYMBOL_NAME(trampoline_end)
SYMBOL_NAME_LABEL(trampoline_end)
请注意,当 AP 跳转到时,BX=1linux/arch/i386/kernel/head.S:startup_32(),这与 BSP 的情况不同 (BX=0)。请参阅 第 6 节

8.4. initialize_secondary()

与 BSP 不同,在 第 6.4 节 中的 linux/arch/i386/kernel/head.S:startup_32() 的末尾,AP 将调用 initialize_secondary() 而不是 start_kernel()

/* Everything has been set up for the secondary
 * CPUs - they just need to reload everything
 * from the task structure
 * This function must not return. */
void __init initialize_secondary(void)
{
        /* We don't actually need to load the full TSS,
         * basically just the stack pointer and the eip. */
        asm volatile(
                "movl %0,%%esp\n\t"
                "jmp *%1"
                :
                :"r" (current->thread.esp),"r" (current->thread.eip));
}
由于 BSP 调用了 do_boot_cpu()thread.eip 设置为 start_secondary(),因此 AP 的控制权被传递给此函数。AP 使用一个新的堆栈帧,该堆栈帧由 BSP 在 do_boot_cpu() -> fork_by_hand() -> do_fork() 中设置。

8.5. start_secondary()

所有 AP 都等待来自 BSP 的信号 smp_commenced,该信号在 第 8.2 节 smp_init() -> smp_commence() 中触发。在收到此信号后,它们将运行“空闲”进程。
///////////////////////////////////////////////////////////////////////////////
int __init start_secondary(void *unused)
{
        /* Dont put anything before smp_callin(), SMP
         * booting is too fragile that we want to limit the
         * things done here to the most necessary things. */
        cpu_init();
        smp_callin();
        while (!atomic_read(&smp_commenced))
                rep_nop();
        /* low-memory mappings have been cleared, flush them from
         * the local TLBs too. */
        local_flush_tlb();
        return cpu_idle();      // never return, see Section 7.3
}
cpu_idle() -> init_idle() 将清除 wait_init_idle 中相应的位,并最终使 BSP 完成 smp_init() 并继续执行 start_kernel() 中的后续函数(即 rest_init())。

8.6. 参考