5. linux/arch/i386/boot/compressed/head.S

我们现在位于 bvmlinux 中!在 misc.c:decompress_kernel() 的帮助下,我们将要解压缩 piggy.o 以获得驻留内核镜像 linux/vmlinux。

此文件是纯 32 位启动代码。与之前的两个文件不同,它在源文件中没有 ".code16" 语句。有关详细信息,请参阅 Using as: Writing 16-bit Code

5.1. 解压缩内核

段描述符(对应于段选择符 __KERNEL_CS 和 __KERNEL_DS)中的段基址等于 0;因此,如果使用这些段选择符中的任何一个,则逻辑地址偏移(以段:偏移格式)将等于其线性地址。对于 zImage,CS:EIP 现在的逻辑地址为 10:1000(线性地址 0x1000);对于 bzImage,逻辑地址为 10:100000(线性地址 0x100000)。

由于分页未启用,线性地址与物理地址相同。有关地址问题,请查看 IA-32 手册(卷 1 第 3.3 章 内存组织,以及卷 3 第 3 章 保护模式内存管理)和 Linux 设备驱动程序:Linux 中的内存管理

从 setup.S 跳转到此处时,BX=0 且 ESI=INITSEG<<4。

.text
///////////////////////////////////////////////////////////////////////////////
startup_32()
{
        cld;
        cli;
        DS = ES = FS = GS = __KERNEL_DS;
        SS:ESP = *stack_start;  // end of user_stack[], defined in misc.c
        // all segment registers are reloaded after protected mode is enabled

        // check that A20 really IS enabled
        EAX = 0;
        do {
1:              DS:[0] = ++EAX;
        } while (DS:[0x100000]==EAX);

        EFLAGS = 0;
        clear BSS;                              // from _edata to _end

        struct moveparams mp;                   // subl $16,%esp
        if (!decompress_kernel(&mp, ESI)) {     // return value in AX
                restore ESI from stack;
                EBX = 0;
                goto __KERNEL_CS:100000;
                // see linux/arch/i386/kernel/head.S:startup_32
        }

        /*
         * We come here, if we were loaded high.
         * We need to move the move-in-place routine down to 0x1000
         * and then start it with the buffer addresses in registers,
         * which we got from the stack.
         */
3:      move move_routine_start..move_routine_end to 0x1000;
        // move_routine_start & move_routine_end are defined below

        // prepare move_routine_start() parameters
        EBX = real mode pointer;        // ESI value passed from setup.S
        ESI = mp.low_buffer_start;
        ECX = mp.lcount;
        EDX = mp.high_buffer_start;
        EAX = mp.hcount;
        EDI = 0x100000;
        cli;                    // make sure we don't get interrupted
        goto __KERNEL_CS:1000;  // move_routine_start();
}

/* Routine (template) for moving the decompressed kernel in place,
 * if we were high loaded. This _must_ PIC-code ! */
///////////////////////////////////////////////////////////////////////////////
move_routine_start()
{
        move mp.low_buffer_start to 0x100000, mp.lcount bytes,
          in two steps: (lcount >> 2) words + (lcount & 3) bytes;
        move/append mp.high_buffer_start, ((mp.hcount + 3) >> 2) words
        // 1 word == 4 bytes, as I mean 32-bit code/data.

        ESI = EBX;              // real mode pointer, as that from setup.S
        EBX = 0;
        goto __KERNEL_CS:100000;
        // see linux/arch/i386/kernel/head.S:startup_32()
move_routine_end:
}
有关 "je 1b" 和 "jnz 3f" 的含义,请参阅 Using as: Local Symbol Names

没有找到 _edata 和 _end 的定义?没问题,它们在“内部链接脚本”中定义。在没有指定 -T (--script=) 选项的情况下,ld 使用此内建脚本来链接 compressed/bvmlinux。使用“ld --verbose”显示此脚本,或查看附录 B。内部链接脚本

有关 -T (--script=)、-L (--library-path=) 和 --verbose 选项的描述,请参阅 Using LD, the GNU linker: Command Line Options。“man ld”和“info ld”也可能有所帮助。

piggy.o 已解压缩,控制权已传递给 __KERNEL_CS:100000,即 linux/arch/i386/kernel/head.S:startup_32()。请参阅 第 6 节

#define LOW_BUFFER_START      0x2000
#define LOW_BUFFER_MAX       0x90000
#define HEAP_SIZE             0x3000
///////////////////////////////////////////////////////////////////////////////
asmlinkage int decompress_kernel(struct moveparams *mv, void *rmode)
|-- setup real_mode(=rmode), vidmem, vidport, lines and cols;
|-- if (is_zImage) setup_normal_output_buffer() {
|       output_data      = 0x100000;
|       free_mem_end_ptr = real_mode;
|   } else (is_bzImage) setup_output_buffer_if_we_run_high(mv) {
|       output_data      = LOW_BUFFER_START;
|       low_buffer_end   = MIN(real_mode, LOW_BUFFER_MAX) & ~0xfff;
|       low_buffer_size  = low_buffer_end - LOW_BUFFER_START;
|       free_mem_end_ptr = &end + HEAP_SIZE;
|       // get mv->low_buffer_start and mv->high_buffer_start
|       mv->low_buffer_start = LOW_BUFFER_START;
|       /* To make this program work, we must have
|        *   high_buffer_start > &end+HEAP_SIZE;
|        * As we will move low_buffer from LOW_BUFFER_START to 0x100000
|        *   (max low_buffer_size bytes) finally, we should have
|        *   high_buffer_start > 0x100000+low_buffer_size; */
|       mv->high_buffer_start = high_buffer_start
|           = MAX(&end+HEAP_SIZE, 0x100000+low_buffer_size);
|       mv->hcount =  0 if (0x100000+low_buffer_size >  &end+HEAP_SIZE);
|                  = -1 if (0x100000+low_buffer_size <= &end+HEAP_SIZE);
|       /* mv->hcount==0 : we need not move high_buffer later,
|        *   as it is already at 0x100000+low_buffer_size.
|        * Used by close_output_buffer_if_we_run_high() below. */
|   }
|-- makecrc();          // create crc_32_tab[]
|   puts("Uncompressing Linux... ");
|-- gunzip();
|   puts("Ok, booting the kernel.\n");
|-- if (is_bzImage) close_output_buffer_if_we_run_high(mv) {
|       // get mv->lcount and mv->hcount
|       if (bytes_out > low_buffer_size) {
|           mv->lcount = low_buffer_size;
|           if (mv->hcount)
|               mv->hcount = bytes_out - low_buffer_size;
|       } else {
|           mv->lcount = bytes_out;
|           mv->hcount = 0;
|       }
|   }
`-- return is_bzImage;  // return value in AX
end 也在“内部链接脚本”中定义。

decompress_kernel() 具有 "asmlinkage" 修饰符。在 linux/include/linux/linkage.h 中:
#ifdef __cplusplus
#define CPP_ASMLINKAGE extern "C"
#else
#define CPP_ASMLINKAGE
#endif

#if defined __i386__
#define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0)))
#elif defined __ia64__
#define asmlinkage CPP_ASMLINKAGE __attribute__((syscall_linkage))
#else
#define asmlinkage CPP_ASMLINKAGE
#endif
宏 "asmlinkage" 将强制编译器将所有函数参数通过堆栈传递,以防某些优化方法可能尝试更改此约定。查看 Using the GNU Compiler Collection (GCC): Declaring Attributes of Functions (regparm) 和 Kernelnewbies FAQ: What is asmlinkage 以了解更多详细信息。

5.2. gunzip()

decompress_kernel() 调用 gunzip() -> inflate(),它们定义在 linux/lib/inflate.c 中,用于将驻留内核镜像解压缩到低缓冲区(由 output_data 指向)和高缓冲区(仅适用于 bzImage,由 high_buffer_start 指向)。

gzip 文件格式在 RFC 1952 中指定。

表 6. gzip 文件格式

组件 | 含义 | 字节 | 注释
ID1 | 标识符 1 | 1 | 31 (0x1f, \037)
ID2 | 标识符 2 | 1 | 139 (0x8b, \213) [a]
CM | 压缩方法 | 1 | 8 - 表示 “deflate” 压缩方法
FLG | 标志 | 1 | 大多数情况下为 0
MTIME | 修改时间 | 4 | 原始文件的修改时间
XFL | 额外标志 | 1 | 2 - 压缩器使用最大压缩率,最慢的算法 [b]
OS | 操作系统 | 1 | 3 - Unix
额外字段 | - | - | 可变长度,由 FLG 指示的字段 [c]
压缩块 | - | - | 可变长度
CRC32 | - | 4 | 未压缩数据的 CRC 值
ISIZE | 输入大小 | 4 | 未压缩输入数据的大小模 2^32
注释
a. 对于 gzip 0.5,ID2 值可以为 158 (0x9e, \236);
b. XFL 值 4 - 压缩器使用最快算法;
c. FLG 位 0,FTEXT,不指示任何“额外字段”。

我们可以利用这些文件格式知识,找出 gzip 压缩的 linux/vmlinux 的起始位置。
[root@localhost boot]# hexdump -C /boot/vmlinuz-2.4.20-28.9 | grep '1f 8b 08 00'
00004c50  1f 8b 08 00 01 f6 e1 3f  02 03 ec 5d 7d 74 14 55  |.......?...]}t.U|
[root@localhost boot]# hexdump -C /boot/vmlinuz-2.4.20-28.9 -s 0x4c40 -n 64
00004c40  00 80 0b 00 00 fc 21 00  68 00 00 00 1e 01 11 00  |......!.h.......|
00004c50  1f 8b 08 00 01 f6 e1 3f  02 03 ec 5d 7d 74 14 55  |.......?...]}t.U|
00004c60  96 7f d5 a9 d0 1d 4d ac  56 93 35 ac 01 3a 9c 6a  |......M.V.5..:.j|
00004c70  4d 46 5c d3 7b f8 48 36  c9 6c 84 f0 25 88 20 9f  |MF\.{.H6.l..%. .|
00004c80
[root@localhost boot]# hexdump -C /boot/vmlinuz-2.4.20-28.9 | tail -n 4
00114d40  bd 77 66 da ce 6f 3d d6  33 5c 14 a2 9f 7e fa e9  |.wf..o=.3\...~..|
00114d50  a7 9f 7e fa ff 57 3f 00  00 00 00 00 d8 bc ab ea  |..~..W?.........|
00114d60  44 5d 76 d1 fd 03 33 58  c2 f0 00 51 27 00        |D]v...3X...Q'.|
00114d6e
我们可以看到,在上面的示例中,gzipped 文件从 0x4c50 开始。“1f 8b 08 00”之前的四个字节是 input_len(按小端序存储,值为 0x0011011e),并且 0x4c50+0x0011011e=0x114d6e 正好等于 bzImage(/boot/vmlinuz-2.4.20-28.9)的大小。

static uch *inbuf;           /* input buffer */
static unsigned insize = 0;  /* valid bytes in inbuf */
static unsigned inptr = 0;   /* index of next byte to be processed in inbuf */
///////////////////////////////////////////////////////////////////////////////
static int gunzip(void)
{
        Check input buffer for {ID1, ID2, CM}, must be
                {0x1f, 0x8b, 0x08} (normal case), or
                {0x1f, 0x9e, 0x08} (for gzip 0.5);
        Check FLG (flag byte), must not set bit 1, 5, 6 and 7;
        Ignore {MTIME, XFL, OS};
        Handle optional structures, which correspond to FLG bit 2, 3 and 4;
        inflate();              // handle compressed blocks
        Validate {CRC32, ISIZE};
}
当首次调用在 linux/arch/i386/boot/compressed/misc.c 中定义的 get_byte() 时,它会调用 fill_inbuf() 来设置输入缓冲区:inbuf=input_data 且 insize=input_len。符号 input_data 和 input_len 在 piggy.o 链接脚本中定义。请参阅第 2.5 节。

5.3. inflate()

// some important definitions in misc.c
#define WSIZE 0x8000            /* Window size must be at least 32k,
                                 * and a power of two */
static uch window[WSIZE];       /* Sliding window buffer */
static unsigned outcnt = 0;     /* bytes in output buffer */

// linux/lib/inflate.c
#define wp outcnt
#define flush_output(w) (wp=(w),flush_window())
STATIC unsigned long bb;        /* bit buffer */
STATIC unsigned bk;             /* bits in bit buffer */
STATIC unsigned hufts;          /* track memory usage */
static long free_mem_ptr = (long)&end;
///////////////////////////////////////////////////////////////////////////////
STATIC int inflate()
{
        int e;                  /* last block flag */
        int r;                  /* result code */
        unsigned h;             /* maximum struct huft's malloc'ed */
        void *ptr;

        wp = bb = bk = 0;

        // inflate compressed blocks one by one
        do {
                hufts = 0;
                gzip_mark() { ptr = free_mem_ptr; };
                if ((r = inflate_block(&e)) != 0) {
                        gzip_release() { free_mem_ptr = ptr; };
                        return r;
                }
                gzip_release() { free_mem_ptr = ptr; };
                if (hufts > h)
                        h = hufts;
        } while (!e);

        /* Undo too much lookahead. The next read will be byte aligned so we
         * can discard unused bits in the last meaningful byte. */
        while (bk >= 8) {
                bk -= 8;
                inptr--;
        }

        /* write the output window window[0..outcnt-1] to output_data,
         * update output_ptr/output_data, crc and bytes_out accordingly, and
         * reset outcnt to 0. */
        flush_output(wp);

        /* return success */
        return 0;
}
free_mem_ptr 在 misc.c:malloc() 中用于动态内存分配。在解压缩每个压缩块之前,gzip_mark() 会保存 free_mem_ptr 的值;解压缩后,gzip_release() 将恢复此值。这就是它“free()”在 inflate_block() 中所分配内存的方式。

Gzip 使用 Lempel-Ziv 编码 (LZ77) 来压缩文件。压缩数据格式在 RFC 1951 中指定。inflate_block() 将解压缩压缩块,这些压缩块可以被视为位序列。

每个压缩块的数据结构概述如下
BFINAL (1 bit)
    0  - not the last block
    1  - the last block
BTYPE  (2 bits)
    00 - no compression
        remaining bits until the byte boundary;
        LEN      (2 bytes);
        NLEN     (2 bytes, the one's complement of LEN);
        data     (LEN bytes);
    01 - compressed with fixed Huffman codes
        {
        literal  (7-9 bits, represent code 0..287, excluding 256);
                     // See RFC 1951, table in Paragraph 3.2.6.
        length   (0-5 bits if literal > 256, represent length 3..258);
                     // See RFC 1951, 1st alphabet table in Paragraph 3.2.5.
        data     (of literal bytes if literal < 256);
        distance (5 plus 0-13 extra bits if literal == 257..285, represent
                         distance 1..32768);
                     /* See RFC 1951, 2nd alphabet table in Paragraph 3.2.5,
                      *   but statement in Paragraph 3.2.6. */
                     /* Move backward "distance" bytes in the output stream,
                      * and copy "length" bytes */
        }*           // can be of multiple instances
        literal  (7 bits, all 0, literal == 256, means end of block);
    10 - compressed with dynamic Huffman codes
        HLIT     (5 bits, # of Literal/Length codes - 257, 257-286);
        HDIST    (5 bits, # of Distance codes - 1,         1-32);
        HCLEN    (4 bits, # of Code Length codes - 4,      4 - 19);
        Code Length sequence    ((HCLEN+4)*3 bits)
        /* The following two alphabet tables will be decoded using
         *   the Huffman decoding table which is generated from
         *   the preceding Code Length sequence. */
        Literal/Length alphabet (HLIT+257 codes)
        Distance alphabet       (HDIST+1 codes)
        // Decoding tables will be built from these alphabet tables.
        /* The following is similar to that of fixed Huffman codes portion,
         *   except that they use different decoding tables. */
        {
        literal/length
                 (variable length, depending on Literal/Length alphabet);
        data     (of literal bytes if literal < 256);
        distance (variable length if literal == 257..285, depending on
                         Distance alphabet);
        }*           // can be of multiple instances
        literal  (literal value 256, which means end of block);
    11 - reserved (error)
请注意,数据元素从最低有效位 (LSB) 到最高有效位 (MSB) 打包成字节,而 Huffman 代码则从 MSB 开始打包。另请注意,字面值 286-287 和距离代码 30-31 实际上永远不会出现。

记住上述数据结构并手边备有 RFC 1951,理解 inflate_block() 并不太难。有关 Huffman 编码和字母表生成的更多信息,请参阅 RFC 1951 中的相关段落。

有关更多详细信息,请参阅 linux/lib/inflate.c、gzip 源代码(含有许多内联注释)和相关参考资料。

5.4. 参考