setup.S负责从 BIOS 获取系统数据,并将它们放入系统内存中的适当位置。
其他引导加载程序,如 GNU GRUB 和 LILO,也可以加载 bzImage。这些引导加载程序应将 bzImage 加载到内存中,并设置“实模式内核头”,特别是 type_of_loader,然后将控制权直接传递给 bsetup。setup.S 假设:
bsetup 或 setup 可能未加载到 SETUPSEG:0,即当控制权传递给 setup.S 时,CS 可能不等于 SETUPSEG;
setup 的前 4 个扇区紧跟在 bootsect 之后加载。其余部分可以加载到 SYSSEG:0,位于 vmlinux 之前;此假设不适用于 bsetup。
/* Signature words to ensure LILO loaded us right */ #define SIG1 0xAA55 #define SIG2 0x5A5A INITSEG = DEF_INITSEG # 0x9000, we move boot here, out of the way SYSSEG = DEF_SYSSEG # 0x1000, system loaded at 0x10000 (65536). SETUPSEG = DEF_SETUPSEG # 0x9020, this is the current segment # ... and the former contents of CS DELTA_INITSEG = SETUPSEG - INITSEG # 0x0020 .code16 .text /////////////////////////////////////////////////////////////////////////////// start: { goto trampoline(); // skip the following header } # This is the setup header, and it must start at %cs:2 (old 0x9020:2) .ascii "HdrS" # header signature .word 0x0203 # header version number (>= 0x0105) # or else old loadlin-1.5 will fail) realmode_swtch: .word 0, 0 # default_switch, SETUPSEG start_sys_seg: .word SYSSEG .word kernel_version # pointing to kernel version string # above section of header is compatible # with loadlin-1.5 (header v1.5). Don't # change it. // kernel_version defined below type_of_loader: .byte 0 # = 0, old one (LILO, Loadlin, # Bootlin, SYSLX, bootsect...) # See Documentation/i386/boot.txt for # assigned ids # flags, unused bits must be zero (RFU) bit within loadflags loadflags: LOADED_HIGH = 1 # If set, the kernel is loaded high CAN_USE_HEAP = 0x80 # If set, the loader also has set # heap_end_ptr to tell how much # space behind setup.S can be used for # heap purposes. # Only the loader knows what is free #ifndef __BIG_KERNEL__ .byte 0 #else .byte LOADED_HIGH #endif setup_move_size: .word 0x8000 # size to move, when setup is not # loaded at 0x90000. We will move setup # to 0x90000 then just before jumping # into the kernel. However, only the # loader knows how much data behind # us also needs to be loaded. code32_start: # here loaders can put a different # start address for 32-bit code. 
#ifndef __BIG_KERNEL__ .long 0x1000 # 0x1000 = default for zImage #else .long 0x100000 # 0x100000 = default for big kernel #endif ramdisk_image: .long 0 # address of loaded ramdisk image # Here the loader puts the 32-bit # address where it loaded the image. # This only will be read by the kernel. ramdisk_size: .long 0 # its size in bytes bootsect_kludge: .word bootsect_helper, SETUPSEG heap_end_ptr: .word modelist+1024 # (Header version 0x0201 or later) # space from here (exclusive) down to # end of setup code can be used by setup # for local heap purposes. // modelist is at the end of .text section pad1: .word 0 cmd_line_ptr: .long 0 # (Header version 0x0202 or later) # If nonzero, a 32-bit pointer # to the kernel command line. # The command line should be # located between the start of # setup and the end of low # memory (0xa0000), or it may # get overwritten before it # gets read. If this field is # used, there is no longer # anything magical about the # 0x90000 segment; the setup # can be located anywhere in # low memory 0x10000 or higher. ramdisk_max: .long __MAXMEM-1 # (Header version 0x0203 or later) # The highest safe address for # the contents of an initrd |
__MAXMEM 定义在 linux/include/asm-i386/page.h:
/* * A __PAGE_OFFSET of 0xC0000000 means that the kernel has * a virtual address space of one gigabyte, which limits the * amount of physical memory you can use to about 950MB. */ #define __PAGE_OFFSET (0xC0000000) /* * This much address space is reserved for vmalloc() and iomap() * as well as fixmap mappings. */ #define __VMALLOC_RESERVE (128 << 20) #define __MAXMEM (-__PAGE_OFFSET-__VMALLOC_RESERVE) |
setup 头部必须遵循一定的布局模式。请参考linux/Documentation/i386/boot.txt:
Offset Proto Name Meaning /Size 0200/2 2.00+ jump Jump instruction 0202/4 2.00+ header Magic signature "HdrS" 0206/2 2.00+ version Boot protocol version supported 0208/4 2.00+ realmode_swtch Boot loader hook 020C/2 2.00+ start_sys The load-low segment (0x1000) (obsolete) 020E/2 2.00+ kernel_version Pointer to kernel version string 0210/1 2.00+ type_of_loader Boot loader identifier 0211/1 2.00+ loadflags Boot protocol option flags 0212/2 2.00+ setup_move_size Move to high memory size (used with hooks) 0214/4 2.00+ code32_start Boot loader hook 0218/4 2.00+ ramdisk_image initrd load address (set by boot loader) 021C/4 2.00+ ramdisk_size initrd size (set by boot loader) 0220/4 2.00+ bootsect_kludge DO NOT USE - for bootsect.S use only 0224/2 2.01+ heap_end_ptr Free memory after setup end 0226/2 N/A pad1 Unused 0228/4 2.02+ cmd_line_ptr 32-bit pointer to the kernel command line 022C/4 2.03+ initrd_addr_max Highest legal initrd address |
由于 setup 代码可能不是连续的,我们应该首先检查代码完整性。
/////////////////////////////////////////////////////////////////////////////// trampoline() { start_of_setup(); // never return .space 1024; } /////////////////////////////////////////////////////////////////////////////// // check signature to see if all code loaded start_of_setup() { // Bootlin depends on this being done early, check bootlin:technic.doc int13/AH=15h(AL=0, DL=0x81); // int13/AH=15h: DISK - GET DISK TYPE #ifdef SAFE_RESET_DISK_CONTROLLER int13/AH=0(AL=0, DL=0x80); // int13/AH=00h: DISK - RESET DISK SYSTEM #endif DS = CS; // check signature at end of setup if (setup_sig1!=SIG1 || setup_sig2!=SIG2) { goto bad_sig; } goto goodsig1; } /////////////////////////////////////////////////////////////////////////////// // some small functions prtstr(); /* print asciiz string at DS:SI */ prtsp2(); /* print double space */ prtspc(); /* print single space */ prtchr(); /* print ascii in AL */ beep(); /* print CTRL-G, i.e. beep */ |
如果未找到签名,则其余 setup 代码可能位于 SYSSEG:0 的 vmlinux 之前。
no_sig_mess: .string "No setup signature found ..." goodsig1: goto goodsig; // make near jump /////////////////////////////////////////////////////////////////////////////// // move the rest setup code from SYSSEG:0 to CS:0800 bad_sig() DELTA_INITSEG = 0x0020 (= SETUPSEG - INITSEG) SYSSEG = 0x1000 word start_sys_seg = SYSSEG; // defined in setup header { DS = CS - DELTA_INITSEG; // aka INITSEG BX = (byte)(DS:[497]); // i.e. setup_sects // first 4 sectors already loaded CX = (BX - 4) << 8; // rest code in word (2-bytes) start_sys_seg = (CX >> 3) + SYSSEG; // real system code start move SYSSEG:0 to CS:0800 (CX*2 bytes); if (setup_sig1!=SIG1 || setup_sig2!=SIG2) { no_sig: prtstr("No setup signature found ..."); no_sig_loop: hlt; goto no_sig_loop; } } |
setup 代码已移动到正确的位置。变量 start_sys_seg 指向实际系统代码的起始位置。如果未发生 "bad_sig",start_sys_seg 仍为 SYSSEG。
检查加载程序是否与镜像兼容。
/////////////////////////////////////////////////////////////////////////////// good_sig() char loadflags; // in setup header char type_of_loader; // in setup header LOADHIGH = 1 { DS = CS - DELTA_INITSEG; // aka INITSEG if ( (loadflags & LOADHIGH) && !type_of_loader ) { // Nope, old loader tries to load big-kernel prtstr("Wrong loader, giving up..."); goto no_sig_loop; // defined above in bad_sig() } } loader_panic_mess: .string "Wrong loader, giving up..." |
尝试三种不同的内存检测方案,以 KB 为单位获取扩展内存大小(大于 1M)。
首先,尝试 e820h,它可以让我们组装内存映射;然后尝试 e801h,它返回 32 位内存大小;最后是 88h,它返回 0-64M。
/////////////////////////////////////////////////////////////////////////////// // get memory size loader_ok() E820NR = 0x1E8 E820MAP = 0x2D0 { // when entering this function, DS = CS-DELTA_INITSEG, aka INITSEG (long)DS:[0x1E0] = 0; #ifndef STANDARD_MEMORY_BIOS_CALL (byte)DS:[0x1E8] = 0; // E820NR /* method E820H: see ACPI spec * the memory map from hell. e820h returns memory classified into * a whole bunch of different types, and allows memory holes and * everything. We scan through this memory map and build a list * of the first 32 memory areas, which we return at [E820MAP]. */ meme820: EBX = 0; DI = 0x02D0; // E820MAP do { jmpe820: int15/EAX=E820h(EDX='SMAP', EBX, ECX=20, ES:DI=DS:DI); // int15/AX=E820h: GET SYSTEM MEMORY MAP if (failed || 'SMAP'!=EAX) break; // if (1!=DS:[DI+16]) continue; // not usable good820: if (DS:[1E8]>=32) break; // entry# > E820MAX DS:[0x1E8]++; // entry# ++; DI += 20; // adjust buffer for next again820: } while (!EBX) // not finished bail820: /* method E801H: * memory size is in 1k chunksizes, to avoid confusing loadlin. * we store the 0xe801 memory size in a completely different place, * because it will most likely be longer than 16 bits. * (use 1e0 because that's what Larry Augustine uses in his * alternative new memory detection scheme, and it's sensible * to write everything into the same place.) 
*/ meme801: stc; // to work around buggy BIOSes CX = DX = 0; int15/AX=E801h; /* int15/AX=E801h: GET MEMORY SIZE FOR >64M CONFIGURATIONS * AX = extended memory between 1M and 16M, in K (max 3C00 = 15MB) * BX = extended memory above 16M, in 64K blocks * CX = configured memory 1M to 16M, in K * DX = configured memory above 16M, in 64K blocks */ if (failed) goto mem88; if (!CX && !DX) { CX = AX; DX = BX; } e801usecxdx: (long)DS:[0x1E0] = ((EDX & 0xFFFF) << 6) + (ECX & 0xFFFF); // in K #endif mem88: // old traditional method int15/AH=88h; /* int15/AH=88h: SYSTEM - GET EXTENDED MEMORY SIZE * AX = number of contiguous KB starting at absolute address 100000h */ DS:[2] = AX; } |
检查硬件支持,如键盘、视频适配器、硬盘、MCA 总线和定点设备(pointing device)。
{ // set the keyboard repeat rate to the max int16/AX=0305h(BX=0); // int16/AH=03h: KEYBOARD - SET TYPEMATIC RATE AND DELAY /* Check for video adapter and its parameters and * allow the user to browse video modes. */ video(); // see video.S // get hd0 and hd1 data copy hd0 data (*int41) to CS-DELTA_INITSEG:0080 (16 bytes); // int41: SYSTEM DATA - HARD DISK 0 PARAMETER TABLE ADDRESS copy hd1 data (*int46) to CS-DELTA_INITSEG:0090 (16 bytes); // int46: SYSTEM DATA - HARD DISK 1 PARAMETER TABLE ADDRESS // check if hd1 exists int13/AH=15h(AL=0, DL=0x81); // int13/AH=15h: DISK - GET DISK TYPE if (failed || AH!=03h) { // AH==03h if it is a hard disk no_disk1: clear CS-DELTA_INITSEG:0090 (16 bytes); } is_disk1: // check for Micro Channel (MCA) bus CS-DELTA_INITSEG:[0xA0] = 0; // set table length to 0 int15/AH=C0h; /* int15/AH=C0h: SYSTEM - GET CONFIGURATION * ES:BX = ROM configuration table */ if (failed) goto no_mca; move ROM configuration table (ES:BX) to CS-DELTA_INITSEG:00A0; // CX = (table length<14)? CX:16; first 16 bytes only no_mca: // check for PS/2 pointing device CS-DELTA_INITSEG:[0x1FF] = 0; // default is no pointing device int11h(); // int11h: BIOS - GET EQUIPMENT LIST if (AL & 0x04) { // mouse installed DS:[0x1FF] = 0xAA; } } |
检查 BIOS APM 支持。
#if defined(CONFIG_APM) || defined(CONFIG_APM_MODULE) { DS:[0x40] = 0; // version = 0 means no APM BIOS int15/AX=5300h(BX=0); // int15/AX=5300h: Advanced Power Management v1.0+ - INSTALLATION CHECK if (failed || 'PM'!=BX || !(CX & 0x02)) goto done_apm_bios; // (CX & 0x02) means 32 bit is supported int15/AX=5304h(BX=0); // int15/AX=5304h: Advanced Power Management v1.0+ - DISCONNECT INTERFACE EBX = CX = DX = ESI = DI = 0; int15/AX=5303h(BX=0); /* int15/AX=5303h: Advanced Power Management v1.0+ * - CONNECT 32-BIT PROTMODE INTERFACE */ if (failed) { no_32_apm_bios: // I moved label no_32_apm_bios here DS:[0x4C] &= ~0x0002; // remove 32 bit support bit goto done_apm_bios; } DS:[0x42] = AX, 32-bit code segment base address; DS:[0x44] = EBX, offset of entry point; DS:[0x48] = CX, 16-bit code segment base address; DS:[0x4A] = DX, 16-bit data segment base address; DS:[0x4E] = ESI, APM BIOS code segment length; DS:[0x52] = DI, APM BIOS data segment length; int15/AX=5300h(BX=0); // check again // int15/AX=5300h: Advanced Power Management v1.0+ - INSTALLATION CHECK if (success && 'PM'==BX) { DS:[0x40] = AX, APM version; DS:[0x4C] = CX, APM flags; } else { apm_disconnect: int15/AX=5304h(BX=0); /* int15/AX=5304h: Advanced Power Management v1.0+ * - DISCONNECT INTERFACE */ } done_apm_bios: } #endif |
// call mode switch { if (realmode_swtch) { realmode_swtch(); // mode switch hook } else { rmodeswtch_normal: default_switch() { cli; // no interrupts allowed outb(0x80, 0x70); // disable NMI } } rmodeswtch_end: } // relocate code if necessary { (long)code32 = code32_start; if (!(loadflags & LOADED_HIGH)) { // low loaded zImage // 0x0100 <= start_sys_seg < CS-DELTA_INITSEG do_move0: AX = 0x100; BP = CS - DELTA_INITSEG; // aka INITSEG BX = start_sys_seg; do_move: move system image from (start_sys_seg:0 .. CS-DELTA_INITSEG:0) to 0100:0; // move 0x1000 bytes each time } end_move: |
然后,如有必要,它会将代码从 CS-DELTA_INITSEG:0 (bbootsect 和 bsetup) 重定位到 INITSEG:0。
DS = CS; // aka SETUPSEG // Check whether we need to be downward compatible with version <=201 if (!cmd_line_ptr && 0x20!=type_of_loader && SETUPSEG!=CS) { cli; // as interrupt may use stack when we are moving // store new SS in DX AX = CS - DELTA_INITSEG; DX = SS; if (DX>=AX) { // stack frame will be moved together DX = DX + INITSEG - AX; // i.e. SS-CS+SETUPSEG } move_self_1: /* move CS-DELTA_INITSEG:0 to INITSEG:0 (setup_move_size bytes) * in two steps in order not to overwrite code on CS:IP * move up (src < dest) but downward ("std") */ move CS-DELTA_INITSEG:move_self_here+0x200 to INITSEG:move_self_here+0x200, setup_move_size-(move_self_here+0x200) bytes; // INITSEG:move_self_here+0x200 == SETUPSEG:move_self_here goto SETUPSEG:move_self_here; // CS=SETUPSEG now move_self_here: move CS-DELTA_INITSEG:0 to INITSEG:0, move_self_here+0x200 bytes; // I mean old CS before goto DS = SETUPSEG; SS = DX; } end_move_self: } |
有关 A20 问题和解决方案,请参阅《A20 - a pain from the past》(A20——过去的痛苦)。
A20_TEST_LOOPS = 32 # Iterations per wait A20_ENABLE_LOOPS = 255 # Total loops to try { #if defined(CONFIG_MELAN) // Enable A20. AMD Elan bug fix. outb(0x02, 0x92); // outb(val, port) a20_elan_wait: while (!a20_test()); // test not passed goto a20_done; #endif a20_try_loop: // First, see if we are on a system with no A20 gate. a20_none: if (a20_test()) goto a20_done; // test passed // Next, try the BIOS (INT 0x15, AX=0x2401) a20_bios: int15/AX=2401h; // Int15/AX=2401h: SYSTEM - later PS/2s - ENABLE A20 GATE if (a20_test()) goto a20_done; // test passed // Try enabling A20 through the keyboard controller a20_kbc: empty_8042(); if (a20_test()) goto a20_done; // test again in case BIOS delayed outb(0xD1, 0x64); // command write empty_8042(); outb(0xDF, 0x60); // A20 on empty_8042(); // wait until a20 really *is* enabled a20_kbc_wait: CX = 0; a20_kbc_wait_loop: do { if (a20_test()) goto a20_done; // test passed } while (--CX) // Final attempt: use "configuration port A" outb((inb(0x92) | 0x02) & 0xFE, 0x92); // wait for configuration port A to take effect a20_fast_wait: CX = 0; a20_fast_wait_loop: do { if (a20_test()) goto a20_done; // test passed } while (--CX) // A20 is still not responding. Try frobbing it again. if (--a20_tries) goto a20_try_loop; prtstr("linux: fatal error: A20 gate not responding!"); a20_die: hlt; goto a20_die; } a20_tries: .byte A20_ENABLE_LOOPS // i.e. 255 a20_err_msg: .ascii "linux: fatal error: A20 gate not responding!" .byte 13, 10, 0 |
为了确保代码与所有 32 位 IA-32 处理器的兼容性,请执行以下步骤切换到保护模式:
准备 GDT:第一个 GDT 条目为空描述符,此外还需包含一个代码段描述符和一个数据段描述符;
禁用中断,包括可屏蔽硬件中断和 NMI;
使用 "lgdt" 指令将 GDT 的基地址和限长加载到 GDTR 寄存器;
使用 "mov cr0" (Intel 386 及更高版本) 或 "lmsw" 指令 (与 Intel 286 兼容) 在 CR0 寄存器中设置 PE 标志;
立即执行远 "jmp" 或远 "call" 指令。
a20_done: { lidt idt_48; // load idt with 0, 0; // convert DS:gdt to a linear ptr *(long*)(gdt_48+2) = DS << 4 + &gdt; lgdt gdt_48; // reset coprocessor outb(0, 0xF0); delay(); outb(0, 0xF1); delay(); // reprogram the interrupts outb(0xFF, 0xA1); // mask all interrupts delay(); outb(0xFB, 0x21); // mask all irq's but irq2 which is cascaded // protected mode! AX = 1; lmsw ax; // machine status word, bit 0 thru 15 of CR0 // only affects PE, MP, EM & TS flags goto flush_instr; flush_instr: BX = 0; // flag to indicate a boot ESI = (CS - DELTA_INITSEG) << 4; // pointer to real-mode code /* NOTE: For high loaded big kernels we need a * jmpi 0x100000,__KERNEL_CS * * but we yet haven't reloaded the CS register, so the default size * of the target offset still is 16 bit. * However, using an operand prefix (0x66), the CPU will properly * take our 48 bit far pointer. (INTeL 80386 Programmer's Reference * Manual, Mixing 16-bit and 32-bit code, page 16-6) */ // goto __KERNEL_CS:[(uint32*)code32]; */ .byte 0x66, 0xea code32: .long 0x1000 // overwritten in Section 4.7 .word __KERNEL_CS // segment 0x10 // see linux/arch/i386/boot/compressed/head.S:startup_32 } |
控制权传递给 linux/arch/i386/boot/compressed/head.S:startup_32。对于 zImage,地址为 0x1000;对于 bzImage,地址为 0x100000。请参阅 第 5 节。
ESI 指向收集的系统数据的内存区域。它用于将参数从内核的 16 位实模式代码传递到 32 位部分。请参阅linux/Documentation/i386/zero-page.txt了解详情。
有关模式切换的详细信息,请参阅 IA-32 手册 Vol.3。(第 9.8 节. 保护模式操作的软件初始化,第 9.9.1 节. 切换到保护模式,以及第 17.4 节. 在混合大小代码段之间传输控制权)。
其余是支持函数和变量。
/* macros created by linux/Makefile targets: * include/linux/compile.h and include/linux/version.h */ kernel_version: .ascii UTS_RELEASE .ascii " (" .ascii LINUX_COMPILE_BY .ascii "@" .ascii LINUX_COMPILE_HOST .ascii ") " .ascii UTS_VERSION .byte 0 /////////////////////////////////////////////////////////////////////////////// default_switch() { cli; outb(0x80, 0x70); } /* disable interrupts and NMI */ bootsect_helper(ES:BX); /* see Section 3.7 */ /////////////////////////////////////////////////////////////////////////////// a20_test() { FS = 0; GS = 0xFFFF; CX = A20_TEST_LOOPS; // i.e. 32 AX = FS:[0x200]; do { a20_test_wait: FS:[0x200] = ++AX; delay(); } while (AX==GS:[0x210] && --CX); return (AX!=GS[0x210]); // ZF==0 (i.e. NZ/NE, a20_test!=0) means test passed } /////////////////////////////////////////////////////////////////////////////// // check that the keyboard command queue is empty empty_8042() { int timeout = 100000; for (;;) { empty_8042_loop: if (!--timeout) return; delay(); inb(0x64, &AL); // 8042 status port if (AL & 1) { // has output delay(); inb(0x60, &AL); // read it no_output: } else if (!(AL & 2)) return; // no input either } } /////////////////////////////////////////////////////////////////////////////// // read the CMOS clock, return the seconds in AL, used in video.S gettime() { int1A/AH=02h(); /* int1A/AH=02h: TIME - GET REAL-TIME CLOCK TIME * DH = seconds in BCD */ AL = DH & 0x0F; AH = DH >> 4; aad; } /////////////////////////////////////////////////////////////////////////////// delay() { outb(AL, 0x80); } // needed after doing I/O // Descriptor table gdt: .word 0, 0, 0, 0 # dummy .word 0, 0, 0, 0 # unused // segment 0x10, __KERNEL_CS .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb) .word 0 # base address = 0 .word 0x9A00 # code read/exec .word 0x00CF # granularity = 4096, 386 # (+5th nibble of limit) // segment 0x18, __KERNEL_DS .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb) .word 0 # base address = 0 .word 0x9200 # data read/write .word 
0x00CF # granularity = 4096, 386 # (+5th nibble of limit) idt_48: .word 0 # idt limit = 0 .word 0, 0 # idt base = 0L /* [gdt_48] should be 0x0800 (2048) to match the comment, * like what Linux 2.2.22 does. */ gdt_48: .word 0x8000 # gdt limit=2048, # 256 GDT entries .word 0, 0 # gdt base (filled in later) #include "video.S" // signature at the end of setup.S: { setup_sig1: .word SIG1 // 0xAA55 setup_sig2: .word SIG2 // 0x5A5A modelist: } |
视频设置和检测代码在video.S:
ASK_VGA = 0xFFFD // defined in linux/include/asm-i386/boot.h /////////////////////////////////////////////////////////////////////////////// video() { pushw DS; // use different segments FS = DS; DS = ES = CS; GS = 0; cld; basic_detect(); // basic adapter type testing (EGA/VGA/MDA/CGA) #ifdef CONFIG_VIDEO_SELECT if (FS:[0x01FA]!=ASK_VGA) { // user selected video mode mode_set(); if (failed) { prtstr("You passed an undefined mode number.\n"); mode_menu(); } } else { vid2: mode_menu(); } vid1: #ifdef CONFIG_VIDEO_RETAIN restore_screen(); // restore screen contents #endif /* CONFIG_VIDEO_RETAIN */ #endif /* CONFIG_VIDEO_SELECT */ mode_params(); // store mode parameters popw ds; // restore original DS } |