6. linux/arch/i386/kernel/head.S

常驻内核镜像 linux/vmlinux 最终就位!它需要两个输入:

ESI 指向来自 16 位实模式代码的参数区域,该区域稍后将被复制到 empty_zero_page。ESI 仅对 BSP 有效。

BX 指示当前运行的是哪个 CPU:0 表示 BSP,非零值表示 AP。

BSP(引导启动处理器)和 AP(应用处理器)是 Intel 术语。请查阅 IA-32 手册(Vol.3. 第 7.5 章 多处理器 (MP) 初始化)和 多处理器规范,了解 MP 初始化问题。

从软件的角度来看,在多处理器系统中,BSP 和 AP 共享物理内存,但使用各自的寄存器组。BSP 首先运行内核代码,设置操作系统执行环境,并触发 AP 也运行。AP 将处于休眠状态,直到 BSP 启动它。

6.1. 启用分页

.text
///////////////////////////////////////////////////////////////////////////////
startup_32()
{
        /* set segments to known values */
        cld;
        DS = ES = FS = GS = __KERNEL_DS;

#ifdef CONFIG_SMP
#define cr4_bits mmu_cr4_features-__PAGE_OFFSET
        /* long mmu_cr4_features defined in linux/arch/i386/kernel/setup.c
         * __PAGE_OFFSET = 0xC0000000, i.e. 3G */

        // AP with CR4 support (> Intel 486) will copy CR4 from BSP
        if (BX && cr4_bits) {
                // turn on paging options (PSE, PAE, ...)
                CR4 |= cr4_bits;
        } else
#endif
        {
                /* only BSP initializes page tables (pg0..empty_zero_page-1)
                 *   pg0 at .org 0x2000
                 *   empty_zero_page at .org 0x4000
                 *   total (0x4000-0x2000)/4 = 0x0800 entries */
                pg0 = {
                        0x00000007,             // 7 = PRESENT + RW + USER
                        0x00001007,             // 0x1000 = 4096 = 4K
                        0x00002007,
                        ...
                pg1:    0x00400007,
                        ...
                        0x007FF007              // total 8M
                empty_zero_page:
                };
        }
为什么在引用内核符号时必须添加 "-__PAGE_OFFSET",例如像 pg0 这样?

linux/arch/i386/vmlinux.lds中,我们有
  . = 0xC0000000 + 0x100000;
  _text = .;                    /* Text and read-only data */
  .text : {
        *(.text)
...
由于 pg0 位于 linux/arch/i386/kernel/head.o 的 .text 节中偏移量 0x2000 处,而 head.o 是链接进 linux/vmlinux 的第一个文件,因此 pg0 也位于输出节 .text 中的偏移量 0x2000 处。链接后它的地址为 0xC0000000+0x100000+0x2000。
[root@localhost boot]# nm --defined /boot/vmlinux-2.4.20-28.9 | grep 'startup_32\|mmu_cr4_features\|pg0\|\<empty_zero_page\>' | sort
c0100000 t startup_32
c0102000 T pg0
c0104000 T empty_zero_page
c0376404 B mmu_cr4_features
在未启用分页的保护模式下,线性地址将直接映射到物理地址。"movl $pg0-__PAGE_OFFSET,%edi" 将设置 EDI=0x102000,这等于 pg0 的物理地址(因为 linux/vmlinux 被加载到物理地址 0x100000 处)。

mmu_cr4_features 位于 .bss 节中,在上述示例中其物理地址为 0x376404。

在页表初始化后,可以启用分页。
        // set page directory base pointer, physical address
        CR3 = swapper_pg_dir - __PAGE_OFFSET;
        // paging enabled!
        CR0 |= 0x80000000;      // set PG bit
        goto 1f;                // flush prefetch-queue
1:
        EAX = &1f;              // address following the next instruction
        goto *(EAX);            // relocate EIP
1:
        SS:ESP = *stack_start;
页目录 swapper_pg_dir(定义见第 6.5 节)连同页表 pg0 和 pg1,将线性地址 0..8M-1 和 3G..3G+8M-1 都映射到物理地址 0..8M-1。从此以后,我们可以不加 "-__PAGE_OFFSET" 直接访问内核符号,因为启用分页后,内核空间(线性地址 >=3G)会被正确地映射到其物理地址。

"lss stack_start,%esp"(即 SS:ESP = *stack_start)是第一个不使用 "-__PAGE_OFFSET" 引用符号的例子,它设置了一个新的堆栈。对于 BSP,堆栈位于 init_task_union 的末尾。对于 AP,stack_start.esp 已在 linux/arch/i386/kernel/smpboot.c:do_boot_cpu() 中被重新设置为 "(void *) (1024 + PAGE_SIZE + (char *)idle)"(见第 8.2 节)。

有关分页机制和数据结构,请参阅 IA-32 手册 Vol.3(第 3.7 章 使用 32 位物理地址的分页转换,第 9.8.3 章 初始化分页,第 9.9.1 章 切换到保护模式,以及第 18.26.3 章 启用和禁用分页)。

6.2. 获取内核参数

#define OLD_CL_MAGIC_ADDR       0x90020
#define OLD_CL_MAGIC            0xA33F
#define OLD_CL_BASE_ADDR        0x90000
#define OLD_CL_OFFSET           0x90022
#define NEW_CL_POINTER          0x228   /* Relative to real mode data */

#ifdef CONFIG_SMP
        if (BX) {
                EFLAGS = 0;             // AP clears EFLAGS
        } else
#endif
        {
                // Initial CPU cleans BSS
                clear BSS;              // i.e. __bss_start .. _end
                setup_idt() {
                        /* idt_table[256] defined in arch/i386/kernel/traps.c
                         *   located in section .data.idt
                        EAX = __KERNEL_CS << 16 + ignore_int;
                        DX = 0x8E00;    // interrupt gate, dpl = 0, present
                        idt_table[0..255] = {EAX, EDX};
                }
                EFLAGS = 0;
                /*
                 * Copy bootup parameters out of the way. First 2kB of
                 * _empty_zero_page is for boot parameters, second 2kB
                 * is for the command line.
                 */
                move *ESI (real-mode header) to empty_zero_page, 2KB;
                clear empty_zero_page+2K, 2KB;
                ESI = empty_zero_page[NEW_CL_POINTER];
                if (!ESI) {             // 32-bit command line pointer
                        if (OLD_CL_MAGIC==(uint16)[OLD_CL_MAGIC_ADDR]) {
                                ESI = [OLD_CL_BASE_ADDR]
                                      + (uint16)[OLD_CL_OFFSET];
                                move *ESI to empty_zero_page+2K, 2KB;
                        }
                } else {                // valid in 2.02+
                        move *ESI to empty_zero_page+2K, 2KB;
                }
        }
}
对于 BSP,内核参数从 ESI 指向的内存复制到 empty_zero_page。内核命令行(如果适用)将被复制到 empty_zero_page+2K。

6.3. 检查 CPU 类型

有关如何识别处理器类型和处理器功能,请参阅 IA-32 手册 Vol.1(第 13 章 处理器识别和功能确定)。

struct cpuinfo_x86;     // see include/asm-i386/processor.h
struct cpuinfo_x86 boot_cpu_data;       // see arch/i386/kernel/setup.c

#define CPU_PARAMS      SYMBOL_NAME(boot_cpu_data)
#define X86             CPU_PARAMS+0
#define X86_VENDOR      CPU_PARAMS+1
#define X86_MODEL       CPU_PARAMS+2
#define X86_MASK        CPU_PARAMS+3
#define X86_HARD_MATH   CPU_PARAMS+6
#define X86_CPUID       CPU_PARAMS+8
#define X86_CAPABILITY  CPU_PARAMS+12
#define X86_VENDOR_ID   CPU_PARAMS+28

checkCPUtype:
{
        X86_CPUID = -1;                 // no CPUID

        X86 = 3;                        // at least 386
        save original EFLAGS to ECX;
        flip AC bit (0x40000) in EFLAGS;
        if (AC bit not changed) goto is386;

        X86 = 4;                        // at least 486
        flip ID bit (0X200000) in EFLAGS;
        restore original EFLAGS;        // for AC & ID flags
        if (ID bit can not be changed) goto is486;

        // get CPU info
        CPUID(EAX=0);
        X86_CPUID = EAX;
        X86_VENDOR_ID = {EBX, EDX, ECX};
        if (!EAX) goto is486;

        CPUID(EAX=1);
        CL = AL;
        X86 = AH & 0x0f;                // family
        X86_MODEL = (AL & 0xf0) >> 4;   // model
        X86_MASK = CL & 0x0f;           // stepping id
        X86_CAPABILITY = EDX;           // feature

有关如何设置 x87 FPU,请参阅 IA-32 手册 Vol.3(第 9.2 章 x87 FPU 初始化,以及第 18.14 章 x87 FPU)。

is486:
        // save PG, PE, ET and set AM, WP, NE, MP
        EAX = (CR0 & 0x80000011) | 0x50022;
        goto 2f;                        // skip "is386:" processing
is386:
        restore original EFLAGS from ECX;
        // save PG, PE, ET and set MP
        EAX = (CR0 & 0x80000011) | 0x02;

        /* ET: Extension Type (bit 4 of CR0).
         * In the Intel 386 and Intel 486 processors, this flag indicates
         * support of Intel 387 DX math coprocessor instructions when set.
         * In the Pentium 4, Intel Xeon, and P6 family processors,
         * this flag is hardcoded to 1.
         *     -- IA-32 Manual Vol.3. Ch.2.5. Control Registers (p.2-14) */

2:      CR0 = EAX;
        check_x87() {
                /* Probe for an x87 coprocessor and record the result in
                 * X86_HARD_MATH.
                 * We depend on ET to be correct.
                 * This checks for 287/387. */
                X86_HARD_MATH = 0;      // assume no FPU until proven otherwise
                clts;                   // CR0.TS = 0;
                fninit;                 // Init FPU;
                fstsw AX;               // AX = x87 FPU status word (not ST(0))
                /* A non-zero low byte of the status word right after
                 * fninit is treated as "no coprocessor present". */
                if (AL) {
                        /* EM (0x04) was cleared when CR0 was built above
                         * (mask 0x80000011), so the XOR sets it. */
                        CR0 ^= 0x04;    // no coprocessor, set EM
                } else {
                        ALIGN
1:                      X86_HARD_MATH = 1;
                        /* IA-32 Manual Vol.3. Ch.18.14.7.14. FSETPM Instruction
                         * inform 287 that processor is in protected mode
                         * 287 only, ignored by 387 */
                        fsetpm;
                }
        }
}
宏 ALIGN,定义在linux/include/linux/linkage.h中,指定 16 字节对齐和填充值 0x90(NOP 的操作码)。另请参阅 Using as: 汇编器指令,了解指令 .align 的含义。

6.4. 进入启动内核

        ready:  .byte 0;        // global variable
{
        ready++;                // how many CPUs are ready
        lgdt gdt_descr;         // use new descriptor table in safe place
        lidt idt_descr;
        goto __KERNEL_CS:$1f;   // reload segment registers after "lgdt"
1:      DS = ES = FS = GS = __KERNEL_DS;
#ifdef CONFIG_SMP
        SS = __KERNEL_DS;       // reload segment only
#else
        SS:ESP = *stack_start;  /* end of init_task_union, defined
                                 *   in linux/arch/i386/kernel/init_task.c */
#endif
        EAX = 0;
        lldt AX;
        cld;

#ifdef CONFIG_SMP
        if (1!=ready) {         // not first CPU
                initialize_secondary();
                // see linux/arch/i386/kernel/smpboot.c
        } else
#endif
        {
                start_kernel(); // see linux/init/main.c
        }
L6:     goto L6;
}
第一个 CPU (BSP) 将调用 linux/init/main.c:start_kernel(),而其他 CPU (AP) 将调用 linux/arch/i386/kernel/smpboot.c:initialize_secondary()。请参阅第 7 节中的 start_kernel() 和第 8.4 节中的 initialize_secondary()。

init_task_union 恰好是第一个进程,“idle”进程 (pid=0) 的任务结构,其堆栈从 init_task_union 的尾部增长。以下是与 init_task_union 相关的代码
ENTRY(stack_start)
        .long init_task_union+8192;
        .long __KERNEL_DS;

/* Size in bytes of the storage shared by a task structure and its stack.
 * With a 4-byte long the default is 2048*4 = 8192, which matches the
 * "init_task_union+8192" initial stack pointer in stack_start. */
#ifndef INIT_TASK_SIZE
# define INIT_TASK_SIZE 2048*sizeof(long)
#endif

/* Overlays the task structure and the stack in the same storage:
 * "task" sits at the start of the union, while the stack pointer is
 * initialised to the end of the union (see stack_start). */
union task_union {
        struct task_struct task;
        /* INIT_TASK_SIZE bytes expressed as an array of longs */
        unsigned long stack[INIT_TASK_SIZE/sizeof(long)];
};

/* INIT_TASK is used to set up the first task table, touch at
 * your own risk! Base=0, limit=0x1fffff (=2MB) */
union task_union init_task_union
        __attribute__((__section__(".data.init_task"))) =
                { INIT_TASK(init_task_union.task) };

init_task_union 用于 BSP “idle” 进程。不要将其与 “init” 进程混淆,后者将在 第 7.2 节 中提及。

6.5. 其他

///////////////////////////////////////////////////////////////////////////////
// default interrupt "handler"
ignore_int() { printk("Unknown interrupt\n"); iret; }

/*
 * The interrupt descriptor table has room for 256 idt's,
 * the global descriptor table is dependent on the number
 * of tasks we can have..
 */
#define IDT_ENTRIES     256
#define GDT_ENTRIES     (__TSS(NR_CPUS))

.globl SYMBOL_NAME(idt)
.globl SYMBOL_NAME(gdt)

        ALIGN
        .word 0
idt_descr:
        .word IDT_ENTRIES*8-1           # idt contains 256 entries
SYMBOL_NAME(idt):
        .long SYMBOL_NAME(idt_table)

        .word 0
gdt_descr:
        .word GDT_ENTRIES*8-1
SYMBOL_NAME(gdt):
        .long SYMBOL_NAME(gdt_table)

/*
 * This is initialized to create an identity-mapping at 0-8M (for bootup
 * purposes) and another mapping of the 0-8M area at virtual address
 * PAGE_OFFSET.
 *
 * Each directory entry below is the physical address of a page table
 * (pg0 at 0x102000, pg1 at 0x103000) OR-ed with the flags
 * 7 = PRESENT + RW + USER.  One entry covers 4MB, so two entries map 8MB.
 */
.org 0x1000
ENTRY(swapper_pg_dir)   // "ENTRY" defined in linux/include/linux/linkage.h
        /* linear 0..8M-1 -> physical 0..8M-1 (identity mapping) */
        .long 0x00102007        /* pg0: 0..4M-1 */
        .long 0x00103007        /* pg1: 4M..8M-1 */
        .fill BOOT_USER_PGD_PTRS-2,4,0  /* default: 766 entries, left empty */
        /* linear PAGE_OFFSET..PAGE_OFFSET+8M-1 -> physical 0..8M-1 */
        .long 0x00102007        /* pg0 again */
        .long 0x00103007        /* pg1 again */
        .fill BOOT_KERNEL_PGD_PTRS-2,4,0        /* default: 254 entries, left empty */

/*
 * The page tables are initialized to only 8MB here - the final page
 * tables are set up later depending on memory size.
 */
.org 0x2000
ENTRY(pg0)

.org 0x3000
ENTRY(pg1)

/*
 * empty_zero_page must immediately follow the page tables ! (The
 * initialization loop counts until empty_zero_page)
 */
.org 0x4000
ENTRY(empty_zero_page)

/*
 * Real beginning of normal "text" segment
 */
.org 0x5000
ENTRY(stext)
ENTRY(_stext)

///////////////////////////////////////////////////////////////////////////////
/*
 * This starts the data section. Note that the above is all
 * in the text section because it has alignment requirements
 * that we cannot fulfill any other way.
 */
.data

ALIGN
/*
 * This contains typically 140 quadwords, depending on NR_CPUS.
 *
 * NOTE! Make sure the gdt descriptor in head.S matches this if you
 * change anything.
 */
ENTRY(gdt_table)
        .quad 0x0000000000000000        /* NULL descriptor */
        .quad 0x0000000000000000        /* not used */
        .quad 0x00cf9a000000ffff        /* 0x10 kernel 4GB code at 0x00000000 */
        .quad 0x00cf92000000ffff        /* 0x18 kernel 4GB data at 0x00000000 */
        .quad 0x00cffa000000ffff        /* 0x23 user   4GB code at 0x00000000 */
        .quad 0x00cff2000000ffff        /* 0x2b user   4GB data at 0x00000000 */
        .quad 0x0000000000000000        /* not used */
        .quad 0x0000000000000000        /* not used */
        /*
         * The APM segments have byte granularity and their bases
         * and limits are set at run time.
         */
        .quad 0x0040920000000000        /* 0x40 APM set up for bad BIOS's */
        .quad 0x00409a0000000000        /* 0x48 APM CS    code */
        .quad 0x00009a0000000000        /* 0x50 APM CS 16 code (16 bit) */
        .quad 0x0040920000000000        /* 0x58 APM DS    data */
        .fill NR_CPUS*4,8,0             /* space for TSS's and LDT's */
idt_descr 和 gdt_table 之前的宏 ALIGN 是为了性能考虑。

6.6. 参考