常驻内核镜像 linux/vmlinux 终于就位!它需要两个输入:
ESI,指示 16 位实模式代码所在的位置,也称为 INITSEG<<4;
BX,指示哪个 CPU 正在运行,0 表示 BSP,其他值表示 AP。
ESI 指向来自 16 位实模式代码的参数区域,该区域稍后将被复制到 empty_zero_page。ESI 仅对 BSP 有效。
BSP(引导启动处理器)和 AP(应用处理器)是 Intel 术语。有关 MP 初始化的问题,请查阅 IA-32 手册(Vol.3,第 7.5 章 多处理器 (MP) 初始化)和多处理器规范(MultiProcessor Specification)。
从软件的角度来看,在多处理器系统中,BSP 和 AP 共享物理内存,但使用各自的寄存器组。BSP 首先运行内核代码,设置操作系统执行环境,并触发 AP 也运行。AP 将处于休眠状态,直到 BSP 启动它。
.text /////////////////////////////////////////////////////////////////////////////// startup_32() { /* set segments to known values */ cld; DS = ES = FS = GS = __KERNEL_DS; #ifdef CONFIG_SMP #define cr4_bits mmu_cr4_features-__PAGE_OFFSET /* long mmu_cr4_features defined in linux/arch/i386/kernel/setup.c * __PAGE_OFFSET = 0xC0000000, i.e. 3G */ // AP with CR4 support (> Intel 486) will copy CR4 from BSP if (BX && cr4_bits) { // turn on paging options (PSE, PAE, ...) CR4 |= cr4_bits; } else #endif { /* only BSP initializes page tables (pg0..empty_zero_page-1) * pg0 at .org 0x2000 * empty_zero_page at .org 0x4000 * total (0x4000-0x2000)/4 = 0x0800 entries */ pg0 = { 0x00000007, // 7 = PRESENT + RW + USER 0x00001007, // 0x1000 = 4096 = 4K 0x00002007, ... pg1: 0x00400007, ... 0x007FF007 // total 8M empty_zero_page: }; } |
在 linux/arch/i386/vmlinux.lds 中,我们有:
. = 0xC0000000 + 0x100000; _text = .; /* Text and read-only data */ .text : { *(.text) ... |
[root@localhost boot]# nm --defined /boot/vmlinux-2.4.20-28.9 | grep 'startup_32 \|mmu_cr4_features\|pg0\|\<empty_zero_page\>' | sort c0100000 t startup_32 c0102000 T pg0 c0104000 T empty_zero_page c0376404 B mmu_cr4_features |
mmu_cr4_features 位于 .bss 节中;在上述示例中,它的虚拟地址为 0xC0376404,减去 __PAGE_OFFSET (0xC0000000) 后得到物理地址 0x376404。
在页表初始化后,可以启用分页。
// set page directory base pointer, physical address CR3 = swapper_pg_dir - __PAGE_OFFSET; // paging enabled! CR0 |= 0x80000000; // set PG bit goto 1f; // flush prefetch-queue 1: EAX = &1f; // address following the next instruction goto *(EAX); // relocate EIP 1: SS:ESP = *stack_start; |
"lss stack_start,%esp" (SS:ESP = *stack_start)是第一个不带 "-__PAGE_OFFSET" 而直接引用符号的例子,它设置了一个新的堆栈。对于 BSP,堆栈位于 init_task_union 的末尾。对于 AP,stack_start.esp 已在 linux/arch/i386/kernel/smpboot.c:do_boot_cpu() 中被重新设置为 "(void *) (1024 + PAGE_SIZE + (char *)idle)",详见第 8.2 节。
有关分页机制和数据结构,请参阅 IA-32 手册 Vol.3(第 3.7 章 使用 32 位物理地址的分页转换,第 9.8.3 章 初始化分页,第 9.9.1 章 切换到保护模式,以及第 18.26.3 章 启用和禁用分页)。
#define OLD_CL_MAGIC_ADDR 0x90020 #define OLD_CL_MAGIC 0xA33F #define OLD_CL_BASE_ADDR 0x90000 #define OLD_CL_OFFSET 0x90022 #define NEW_CL_POINTER 0x228 /* Relative to real mode data */ #ifdef CONFIG_SMP if (BX) { EFLAGS = 0; // AP clears EFLAGS } else #endif { // Initial CPU cleans BSS clear BSS; // i.e. __bss_start .. _end setup_idt() { /* idt_table[256] defined in arch/i386/kernel/traps.c * located in section .data.idt EAX = __KERNEL_CS << 16 + ignore_int; DX = 0x8E00; // interrupt gate, dpl = 0, present idt_table[0..255] = {EAX, EDX}; } EFLAGS = 0; /* * Copy bootup parameters out of the way. First 2kB of * _empty_zero_page is for boot parameters, second 2kB * is for the command line. */ move *ESI (real-mode header) to empty_zero_page, 2KB; clear empty_zero_page+2K, 2KB; ESI = empty_zero_page[NEW_CL_POINTER]; if (!ESI) { // 32-bit command line pointer if (OLD_CL_MAGIC==(uint16)[OLD_CL_MAGIC_ADDR]) { ESI = [OLD_CL_BASE_ADDR] + (uint16)[OLD_CL_OFFSET]; move *ESI to empty_zero_page+2K, 2KB; } } else { // valid in 2.02+ move *ESI to empty_zero_page+2K, 2KB; } } } |
有关如何识别处理器类型和处理器功能,请参阅 IA-32 手册 Vol.1(第 13 章 处理器识别和功能确定)。
struct cpuinfo_x86; // see include/asm-i386/processor.h struct cpuinfo_x86 boot_cpu_data; // see arch/i386/kernel/setup.c #define CPU_PARAMS SYMBOL_NAME(boot_cpu_data) #define X86 CPU_PARAMS+0 #define X86_VENDOR CPU_PARAMS+1 #define X86_MODEL CPU_PARAMS+2 #define X86_MASK CPU_PARAMS+3 #define X86_HARD_MATH CPU_PARAMS+6 #define X86_CPUID CPU_PARAMS+8 #define X86_CAPABILITY CPU_PARAMS+12 #define X86_VENDOR_ID CPU_PARAMS+28 checkCPUtype: { X86_CPUID = -1; // no CPUID X86 = 3; // at least 386 save original EFLAGS to ECX; flip AC bit (0x40000) in EFLAGS; if (AC bit not changed) goto is386; X86 = 4; // at least 486 flip ID bit (0X200000) in EFLAGS; restore original EFLAGS; // for AC & ID flags if (ID bit can not be changed) goto is486; // get CPU info CPUID(EAX=0); X86_CPUID = EAX; X86_VENDOR_ID = {EBX, EDX, ECX}; if (!EAX) goto is486; CPUID(EAX=1); CL = AL; X86 = AH & 0x0f; // family X86_MODEL = (AL & 0xf0) >> 4; // model X86_MASK = CL & 0x0f; // stepping id X86_CAPABILITY = EDX; // feature |
有关如何设置 x87 FPU,请参阅 IA-32 手册 Vol.3(第 9.2 章 x87 FPU 初始化,以及第 18.14 章 x87 FPU)。
is486: // save PG, PE, ET and set AM, WP, NE, MP EAX = (CR0 & 0x80000011) | 0x50022; goto 2f; // skip "is386:" processing is386: restore original EFLAGS from ECX; // save PG, PE, ET and set MP EAX = (CR0 & 0x80000011) | 0x02; /* ET: Extension Type (bit 4 of CR0). * In the Intel 386 and Intel 486 processors, this flag indicates * support of Intel 387 DX math coprocessor instructions when set. * In the Pentium 4, Intel Xeon, and P6 family processors, * this flag is hardcoded to 1. * -- IA-32 Manual Vol.3. Ch.2.5. Control Registers (p.2-14) */ 2: CR0 = EAX; check_x87() { /* We depend on ET to be correct. * This checks for 287/387. */ X86_HARD_MATH = 0; clts; // CR0.TS = 0; fninit; // Init FPU; fstsw AX; // AX = ST(0); if (AL) { CR0 ^= 0x04; // no coprocessor, set EM } else { ALIGN 1: X86_HARD_MATH = 1; /* IA-32 Manual Vol.3. Ch.18.14.7.14. FSETPM Instruction * inform 287 that processor is in protected mode * 287 only, ignored by 387 */ fsetpm; } } } |
ready: .byte 0; // global variable { ready++; // how many CPUs are ready lgdt gdt_descr; // use new descriptor table in safe place lidt idt_descr; goto __KERNEL_CS:$1f; // reload segment registers after "lgdt" 1: DS = ES = FS = GS = __KERNEL_DS; #ifdef CONFIG_SMP SS = __KERNEL_DS; // reload segment only #else SS:ESP = *stack_start; /* end of init_task_union, defined * in linux/arch/i386/kernel/init_task.c */ #endif EAX = 0; lldt AX; cld; #ifdef CONFIG_SMP if (1!=ready) { // not first CPU initialize_secondary(); // see linux/arch/i386/kernel/smpboot.c } else #endif { start_kernel(); // see linux/init/main.c } L6: goto L6; } |
init_task_union 恰好是第一个进程(即 “idle” 进程,pid=0)的任务结构,其堆栈从 init_task_union 的尾部开始增长。以下是与 init_task_union 相关的代码:
ENTRY(stack_start) .long init_task_union+8192; .long __KERNEL_DS; #ifndef INIT_TASK_SIZE # define INIT_TASK_SIZE 2048*sizeof(long) #endif union task_union { struct task_struct task; unsigned long stack[INIT_TASK_SIZE/sizeof(long)]; }; /* INIT_TASK is used to set up the first task table, touch at * your own risk! Base=0, limit=0x1fffff (=2MB) */ union task_union init_task_union __attribute__((__section__(".data.init_task"))) = { INIT_TASK(init_task_union.task) }; |
init_task_union 用于 BSP “idle” 进程。不要将其与 “init” 进程混淆,后者将在 第 7.2 节 中提及。
/////////////////////////////////////////////////////////////////////////////// // default interrupt "handler" ignore_int() { printk("Unknown interrupt\n"); iret; } /* * The interrupt descriptor table has room for 256 idt's, * the global descriptor table is dependent on the number * of tasks we can have.. */ #define IDT_ENTRIES 256 #define GDT_ENTRIES (__TSS(NR_CPUS)) .globl SYMBOL_NAME(idt) .globl SYMBOL_NAME(gdt) ALIGN .word 0 idt_descr: .word IDT_ENTRIES*8-1 # idt contains 256 entries SYMBOL_NAME(idt): .long SYMBOL_NAME(idt_table) .word 0 gdt_descr: .word GDT_ENTRIES*8-1 SYMBOL_NAME(gdt): .long SYMBOL_NAME(gdt_table) /* * This is initialized to create an identity-mapping at 0-8M (for bootup * purposes) and another mapping of the 0-8M area at virtual address * PAGE_OFFSET. */ .org 0x1000 ENTRY(swapper_pg_dir) // "ENTRY" defined in linux/include/linux/linkage.h .long 0x00102007 .long 0x00103007 .fill BOOT_USER_PGD_PTRS-2,4,0 /* default: 766 entries */ .long 0x00102007 .long 0x00103007 /* default: 254 entries */ .fill BOOT_KERNEL_PGD_PTRS-2,4,0 /* * The page tables are initialized to only 8MB here - the final page * tables are set up later depending on memory size. */ .org 0x2000 ENTRY(pg0) .org 0x3000 ENTRY(pg1) /* * empty_zero_page must immediately follow the page tables ! (The * initialization loop counts until empty_zero_page) */ .org 0x4000 ENTRY(empty_zero_page) /* * Real beginning of normal "text" segment */ .org 0x5000 ENTRY(stext) ENTRY(_stext) /////////////////////////////////////////////////////////////////////////////// /* * This starts the data section. Note that the above is all * in the text section because it has alignment requirements * that we cannot fulfill any other way. */ .data ALIGN /* * This contains typically 140 quadwords, depending on NR_CPUS. * * NOTE! Make sure the gdt descriptor in head.S matches this if you * change anything. 
*/ ENTRY(gdt_table) .quad 0x0000000000000000 /* NULL descriptor */ .quad 0x0000000000000000 /* not used */ .quad 0x00cf9a000000ffff /* 0x10 kernel 4GB code at 0x00000000 */ .quad 0x00cf92000000ffff /* 0x18 kernel 4GB data at 0x00000000 */ .quad 0x00cffa000000ffff /* 0x23 user 4GB code at 0x00000000 */ .quad 0x00cff2000000ffff /* 0x2b user 4GB data at 0x00000000 */ .quad 0x0000000000000000 /* not used */ .quad 0x0000000000000000 /* not used */ /* * The APM segments have byte granularity and their bases * and limits are set at run time. */ .quad 0x0040920000000000 /* 0x40 APM set up for bad BIOS's */ .quad 0x00409a0000000000 /* 0x48 APM CS code */ .quad 0x00009a0000000000 /* 0x50 APM CS 16 code (16 bit) */ .quad 0x0040920000000000 /* 0x58 APM DS data */ .fill NR_CPUS*4,8,0 /* space for TSS's and LDT's */ |