常驻内核镜像 linux/vmlinux 最终就位!它需要两个输入:
ESI,指示 16 位实模式代码所在的位置,也称为 INITSEG<<4;
BX,指示哪个 CPU 正在运行,0 表示 BSP,其他值表示 AP。
ESI 指向来自 16 位实模式代码的参数区域,该区域稍后将被复制到 empty_zero_page。ESI 仅对 BSP 有效。
BSP(引导启动处理器)和 AP(应用处理器)是 Intel 术语。请查阅 IA-32 手册(Vol.3. 第 7.5 章 多处理器 (MP) 初始化)和 多处理器规范,了解 MP 初始化问题。
从软件的角度来看,在多处理器系统中,BSP 和 AP 共享物理内存,但使用各自的寄存器组。BSP 首先运行内核代码,设置操作系统执行环境,并触发 AP 也运行。AP 将处于休眠状态,直到 BSP 启动它。
.text
///////////////////////////////////////////////////////////////////////////////
startup_32()
{
/* set segments to known values */
cld;
DS = ES = FS = GS = __KERNEL_DS;
#ifdef CONFIG_SMP
#define cr4_bits mmu_cr4_features-__PAGE_OFFSET
/* long mmu_cr4_features defined in linux/arch/i386/kernel/setup.c
 * __PAGE_OFFSET = 0xC0000000, i.e. 3G
 * Paging is not enabled yet, so the symbol is referenced through its
 * physical address (link address minus __PAGE_OFFSET). */
if (BX) {
    /* AP: never (re)initializes the page tables -- that is the BSP's
     * job -- regardless of whether there are CR4 bits to copy. */
    // AP with CR4 support (> Intel 486) will copy CR4 from BSP;
    // CR4 is left untouched when the BSP recorded no feature bits.
    if (cr4_bits) {
        // turn on paging options (PSE, PAE, ...)
        CR4 |= cr4_bits;
    }
} else
#endif
{
    /* only BSP initializes page tables (pg0..empty_zero_page-1)
     * pg0 at .org 0x2000
     * empty_zero_page at .org 0x4000
     * total (0x4000-0x2000)/4 = 0x0800 entries */
    pg0 = {
        0x00000007, // 7 = PRESENT + RW + USER
        0x00001007, // 0x1000 = 4096 = 4K
        0x00002007,
        ...
        pg1: 0x00400007,
        ...
        0x007FF007 // total 8M
        empty_zero_page:
    };
} |
在linux/arch/i386/vmlinux.lds中,我们有
. = 0xC0000000 + 0x100000;
_text = .; /* Text and read-only data */
.text : {
*(.text)
... |
[root@localhost boot]# nm --defined /boot/vmlinux-2.4.20-28.9 | grep 'startup_32 \|mmu_cr4_features\|pg0\|\<empty_zero_page\>' | sort
c0100000 t startup_32
c0102000 T pg0
c0104000 T empty_zero_page
c0376404 B mmu_cr4_features |
mmu_cr4_features 在 .bss 节中,并且位于上述示例中的物理地址 0x376404。
在页表初始化后,可以启用分页。
// set page directory base pointer, physical address
CR3 = swapper_pg_dir - __PAGE_OFFSET;
// paging enabled!
CR0 |= 0x80000000; // set PG bit
goto 1f; // flush prefetch-queue
1:
EAX = &1f; // address following the next instruction
goto *(EAX); // relocate EIP
1:
SS:ESP = *stack_start; |
"lss stack_start,%esp" (SS:ESP = *stack_start)是第一个不带 "-__PAGE_OFFSET" 而直接引用符号的例子,它设置了一个新的堆栈。对于 BSP,堆栈位于 init_task_union 的末尾。对于 AP,stack_start.esp 已被 linux/arch/i386/kernel/smpboot.c:do_boot_cpu() 重新定义为 "(void *) (1024 + PAGE_SIZE + (char *)idle)"(参见第 8.2 节)。
有关分页机制和数据结构,请参阅 IA-32 手册 Vol.3。(第 3.7 章 使用 32 位物理地址的分页转换,第 9.8.3 章 初始化分页,第 9.9.1 章 切换到保护模式,以及第 18.26.3 章 启用和禁用分页)。
#define OLD_CL_MAGIC_ADDR 0x90020
#define OLD_CL_MAGIC 0xA33F
#define OLD_CL_BASE_ADDR 0x90000
#define OLD_CL_OFFSET 0x90022
#define NEW_CL_POINTER 0x228 /* Relative to real mode data */
#ifdef CONFIG_SMP
if (BX) {
EFLAGS = 0; // AP clears EFLAGS
} else
#endif
{
// Initial CPU cleans BSS
clear BSS; // i.e. __bss_start .. _end
setup_idt() {
/* idt_table[256] defined in arch/i386/kernel/traps.c
* located in section .data.idt
EAX = __KERNEL_CS << 16 + ignore_int;
DX = 0x8E00; // interrupt gate, dpl = 0, present
idt_table[0..255] = {EAX, EDX};
}
EFLAGS = 0;
/*
* Copy bootup parameters out of the way. First 2kB of
* _empty_zero_page is for boot parameters, second 2kB
* is for the command line.
*/
move *ESI (real-mode header) to empty_zero_page, 2KB;
clear empty_zero_page+2K, 2KB;
/* Locate the kernel command line and copy it into the second 2KB of
 * empty_zero_page.  The new-style 32-bit pointer is preferred; the old
 * magic/offset pair is used only when that pointer is zero. */
ESI = empty_zero_page[NEW_CL_POINTER]; // 32-bit command line pointer
if (!ESI) { // no new-style pointer: try the old protocol
    if (OLD_CL_MAGIC==(uint16)[OLD_CL_MAGIC_ADDR]) {
        // OLD_CL_BASE_ADDR is a constant base address, not a memory
        // operand; only the 16-bit offset is read from memory.
        ESI = OLD_CL_BASE_ADDR
              + (uint16)[OLD_CL_OFFSET];
        move *ESI to empty_zero_page+2K, 2KB;
    }
    // magic mismatch: no command line, the area stays cleared
} else { // valid in 2.02+
    move *ESI to empty_zero_page+2K, 2KB;
}
}
} |
有关如何识别处理器类型和处理器功能,请参阅 IA-32 手册 Vol.1。(第 13 章 处理器识别和功能确定)。
struct cpuinfo_x86; // see include/asm-i386/processor.h
struct cpuinfo_x86 boot_cpu_data; // see arch/i386/kernel/setup.c
#define CPU_PARAMS SYMBOL_NAME(boot_cpu_data)
#define X86 CPU_PARAMS+0
#define X86_VENDOR CPU_PARAMS+1
#define X86_MODEL CPU_PARAMS+2
#define X86_MASK CPU_PARAMS+3
#define X86_HARD_MATH CPU_PARAMS+6
#define X86_CPUID CPU_PARAMS+8
#define X86_CAPABILITY CPU_PARAMS+12
#define X86_VENDOR_ID CPU_PARAMS+28
checkCPUtype:
{
X86_CPUID = -1; // no CPUID
X86 = 3; // at least 386
save original EFLAGS to ECX;
flip AC bit (0x40000) in EFLAGS;
if (AC bit not changed) goto is386;
X86 = 4; // at least 486
flip ID bit (0X200000) in EFLAGS;
restore original EFLAGS; // for AC & ID flags
if (ID bit can not be changed) goto is486;
// get CPU info
CPUID(EAX=0);
X86_CPUID = EAX;
X86_VENDOR_ID = {EBX, EDX, ECX};
if (!EAX) goto is486;
CPUID(EAX=1);
CL = AL;
X86 = AH & 0x0f; // family
X86_MODEL = (AL & 0xf0) >> 4; // model
X86_MASK = CL & 0x0f; // stepping id
X86_CAPABILITY = EDX; // feature |
有关如何设置 x87 FPU,请参阅 IA-32 手册 Vol.3。(第 9.2 章 x87 FPU 初始化,以及第 18.14 章 x87 FPU)。
is486:
// save PG, PE, ET and set AM, WP, NE, MP
EAX = (CR0 & 0x80000011) | 0x50022;
goto 2f; // skip "is386:" processing
is386:
restore original EFLAGS from ECX;
// save PG, PE, ET and set MP
EAX = (CR0 & 0x80000011) | 0x02;
/* ET: Extension Type (bit 4 of CR0).
* In the Intel 386 and Intel 486 processors, this flag indicates
* support of Intel 387 DX math coprocessor instructions when set.
* In the Pentium 4, Intel Xeon, and P6 family processors,
* this flag is hardcoded to 1.
* -- IA-32 Manual Vol.3. Ch.2.5. Control Registers (p.2-14) */
2: CR0 = EAX;
/* Probe for an x87 coprocessor and record the result in X86_HARD_MATH
 * (0 = none, 1 = present). */
check_x87() {
/* We depend on ET to be correct.
* This checks for 287/387. */
X86_HARD_MATH = 0;
clts; // CR0.TS = 0;
fninit; // Init FPU;
fstsw AX; // AX = x87 FPU status word
if (AL) { // low status byte not zero after fninit: no FPU responded
CR0 ^= 0x04; // no coprocessor, set EM
} else {
ALIGN
1: X86_HARD_MATH = 1;
/* IA-32 Manual Vol.3. Ch.18.14.7.14. FSETPM Instruction
* inform 287 that processor is in protected mode
* 287 only, ignored by 387 */
fsetpm;
}
}
} |
ready: .byte 0; // global variable
{
ready++; // how many CPUs are ready
lgdt gdt_descr; // use new descriptor table in safe place
lidt idt_descr;
goto __KERNEL_CS:$1f; // reload segment registers after "lgdt"
1: DS = ES = FS = GS = __KERNEL_DS;
#ifdef CONFIG_SMP
SS = __KERNEL_DS; // reload segment only
#else
SS:ESP = *stack_start; /* end of init_task_union, defined
* in linux/arch/i386/kernel/init_task.c */
#endif
EAX = 0;
lldt AX;
cld;
#ifdef CONFIG_SMP
if (1!=ready) { // not first CPU
initialize_secondary();
// see linux/arch/i386/kernel/smpboot.c
} else
#endif
{
start_kernel(); // see linux/init/main.c
}
L6: goto L6;
} |
init_task_union 恰好是第一个进程,“idle”进程 (pid=0) 的任务结构,其堆栈从 init_task_union 的尾部增长。以下是与 init_task_union 相关的代码
ENTRY(stack_start)
.long init_task_union+8192;
.long __KERNEL_DS;
#ifndef INIT_TASK_SIZE
# define INIT_TASK_SIZE 2048*sizeof(long)
#endif
union task_union {
struct task_struct task;
unsigned long stack[INIT_TASK_SIZE/sizeof(long)];
};
/* INIT_TASK is used to set up the first task table, touch at
* your own risk! Base=0, limit=0x1fffff (=2MB) */
union task_union init_task_union
__attribute__((__section__(".data.init_task"))) =
{ INIT_TASK(init_task_union.task) }; |
init_task_union 用于 BSP “idle” 进程。不要将其与 “init” 进程混淆,后者将在 第 7.2 节 中提及。
///////////////////////////////////////////////////////////////////////////////
// default interrupt "handler"
ignore_int() { printk("Unknown interrupt\n"); iret; }
/*
* The interrupt descriptor table has room for 256 idt's,
* the global descriptor table is dependent on the number
* of tasks we can have..
*/
#define IDT_ENTRIES 256
#define GDT_ENTRIES (__TSS(NR_CPUS))
.globl SYMBOL_NAME(idt)
.globl SYMBOL_NAME(gdt)
ALIGN
.word 0
idt_descr:
.word IDT_ENTRIES*8-1 # idt contains 256 entries
SYMBOL_NAME(idt):
.long SYMBOL_NAME(idt_table)
.word 0
gdt_descr:
.word GDT_ENTRIES*8-1
SYMBOL_NAME(gdt):
.long SYMBOL_NAME(gdt_table)
/*
* This is initialized to create an identity-mapping at 0-8M (for bootup
* purposes) and another mapping of the 0-8M area at virtual address
* PAGE_OFFSET.
*/
.org 0x1000
ENTRY(swapper_pg_dir) // "ENTRY" defined in linux/include/linux/linkage.h
/* First two page-directory entries: identity mapping of 0-8M.
 * 0x00102007 = physical address of pg0 (image at 0x100000 + .org 0x2000)
 * with flags 7 (PRESENT + RW + USER); 0x00103007 is the same for pg1. */
.long 0x00102007
.long 0x00103007
.fill BOOT_USER_PGD_PTRS-2,4,0
/* default: 766 entries */
/* The same two page tables again: the 0-8M area mapped at PAGE_OFFSET. */
.long 0x00102007
.long 0x00103007
/* default: 254 entries */
.fill BOOT_KERNEL_PGD_PTRS-2,4,0
/*
* The page tables are initialized to only 8MB here - the final page
* tables are set up later depending on memory size.
*/
.org 0x2000
ENTRY(pg0)
.org 0x3000
ENTRY(pg1)
/*
* empty_zero_page must immediately follow the page tables ! (The
* initialization loop counts until empty_zero_page)
*/
.org 0x4000
ENTRY(empty_zero_page)
/*
* Real beginning of normal "text" segment
*/
.org 0x5000
ENTRY(stext)
ENTRY(_stext)
///////////////////////////////////////////////////////////////////////////////
/*
* This starts the data section. Note that the above is all
* in the text section because it has alignment requirements
* that we cannot fulfill any other way.
*/
.data
ALIGN
/*
* This contains typically 140 quadwords, depending on NR_CPUS.
*
* NOTE! Make sure the gdt descriptor in head.S matches this if you
* change anything.
*/
ENTRY(gdt_table)
.quad 0x0000000000000000 /* NULL descriptor */
.quad 0x0000000000000000 /* not used */
.quad 0x00cf9a000000ffff /* 0x10 kernel 4GB code at 0x00000000 */
.quad 0x00cf92000000ffff /* 0x18 kernel 4GB data at 0x00000000 */
.quad 0x00cffa000000ffff /* 0x23 user 4GB code at 0x00000000 */
.quad 0x00cff2000000ffff /* 0x2b user 4GB data at 0x00000000 */
.quad 0x0000000000000000 /* not used */
.quad 0x0000000000000000 /* not used */
/*
* The APM segments have byte granularity and their bases
* and limits are set at run time.
*/
.quad 0x0040920000000000 /* 0x40 APM set up for bad BIOS's */
.quad 0x00409a0000000000 /* 0x48 APM CS code */
.quad 0x00009a0000000000 /* 0x50 APM CS 16 code (16 bit) */
.quad 0x0040920000000000 /* 0x58 APM DS data */
.fill NR_CPUS*4,8,0 /* space for TSS's and LDT's */ |