关于编程生成的目标文件到底是怎么链接生成可执行文件的

以C/C++程序为例，要想生成可执行文件，需要依次经过如下步骤：预处理、编译、汇编（生成目标文件）、链接（生成可执行文件）。

那么每一个目标文件到底是如何组合在一起，生成可执行文件的？

在linux中，对于目标文件和可执行代码，都是elf文件，这种文件表示了可执行代码应该如何被排放在内存中。

ELF文件

（此处原有一张ELF文件结构图）从该图中可以看到，ELF文件可以划分为4个部分：
ELF文件头，程序头表，节，节头表

ELF文件头：包含了ELF文件的一些信息，如：目标机器的体系结构(x86-64, ARM等)，32bit或者64bit等
程序头表：只在可执行文件中存在，指示哪些节应该放在虚拟内存中的哪个位置
节：真实存在的程序数据内容
节头表：标识每个节存在的位置

与上面几个部分相关的数据结构定义在<elf.h>中（前三个分别描述ELF文件头、程序头表项和节头表项；Elf64_Sym描述的是符号表节中的一个符号项）：

Elf64_Ehdr
Elf64_Phdr
Elf64_Shdr
Elf64_Sym

示例分析程序是如何加载的

先在obj.c中定义我们之后要调用的函数

/* Return `num` increased by 5. */
int add5(int num) {
    int result = num;

    result += 5;
    return result;
}
/* Return `num` increased by 10. */
int add10(int num) {
    int result = num;

    result += 10;
    return result;
}

将其编译为目标文件

gcc -c obj.c

下面我们从另一个文件loader.c中对该函数进行调用：
首先定义一个load_obj函数，将目标文件的内容加载到内存中

/*
 * Map the whole of "obj.o" (looked up in the current directory) read-only
 * into memory and store the start address of the mapping in `obj.base`.
 *
 * Any failure is reported with perror() and terminates the process, using
 * the errno value of the failing call as the exit status.
 */
static void load_obj(void)
{
    struct stat sb;

    /* open(2) signals failure with -1; descriptor 0 is a valid result */
    int fd = open("obj.o", O_RDONLY);
    if (fd < 0) {
        int err = errno; /* perror() may overwrite errno */
        perror("Cannot open obj.o");
        exit(err);
    }

    /* we need obj.o size for mmap(2) */
    if (fstat(fd, &sb)) {
        int err = errno;
        perror("Failed to get obj.o info");
        exit(err);
    }

    /* mmap obj.o into memory */
    obj.base = mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
    if (obj.base == MAP_FAILED) {
        int err = errno;
        perror("Maping obj.o failed");
        exit(err);
    }

    /* per mmap(2), closing the descriptor does not unmap the region */
    close(fd);
}

其中重点需要关注的为mmap函数，其负责将目标文件的内容加载到内存中
在这里插入图片描述
参数addr代表映射的起始内存地址，如果为NULL的话，由内核自动选择。
参数length代表想要从文件映射到内存的数据的字节数
参数prot代表读入的内存的权限,可执行，可读，可写，不可访问
返回开始的内存地址
综上：如果load_obj函数成功执行，那么目标文件就会被映射到mmap返回的内存地址上，该地址保存在obj.base中。

然后再定义一个解析函数parse_obj

static void parse_obj(void)
{
    /* the sections table offset is encoded in the ELF header */
    sections = (const Elf64_Shdr *)(obj.base + obj.hdr->e_shoff);
    /* the index of `.shstrtab` in the sections table is encoded in the ELF header
     * so we can find it without actually using a name lookup
     */
    shstrtab = (const char *)(obj.base + sections[obj.hdr->e_shstrndx].sh_offset);

    /* find the `.symtab` entry in the sections table */
    const Elf64_Shdr *symtab_hdr = lookup_section(".symtab");
    if (!symtab_hdr) {
        fputs("Failed to find .symtab\n", stderr);
        exit(ENOEXEC);
    }

    /* the symbols table */
    symbols = (const Elf64_Sym *)(obj.base + symtab_hdr->sh_offset);
    /* number of entries in the symbols table = table size / entry size */
    num_symbols = symtab_hdr->sh_size / symtab_hdr->sh_entsize;

    const Elf64_Shdr *strtab_hdr = lookup_section(".strtab");
    if (!strtab_hdr) {
        fputs("Failed to find .strtab\n", stderr);
        exit(ENOEXEC);
    }

    strtab = (const char *)(obj.base + strtab_hdr->sh_offset);

    /* get system page size */
    page_size = sysconf(_SC_PAGESIZE);

    /* find the `.text` entry in the sections table */
    const Elf64_Shdr *text_hdr = lookup_section(".text");
    if (!text_hdr) {
        fputs("Failed to find .text\n", stderr);
        exit(ENOEXEC);
    }

    /* allocate memory for `.text` copy rounding it up to whole pages */
    text_runtime_base = mmap(NULL, page_align(text_hdr->sh_size), PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (text_runtime_base == MAP_FAILED) {
        perror("Failed to allocate memory for .text");
        exit(errno);
    }

    /* copy the contents of `.text` section from the ELF file */
    memcpy(text_runtime_base, obj.base + text_hdr->sh_offset, text_hdr->sh_size);

    /* make the `.text` copy readonly and executable */
    if (mprotect(text_runtime_base, page_align(text_hdr->sh_size), PROT_READ | PROT_EXEC)) {
        perror("Failed to make .text executable");
        exit(errno);
    }
}

其中obj是一个结构体，其定义如下：

/*
 * Two views of one pointer to the start of the object file image:
 *   hdr  - the image typed as an ELF file header, for reading header fields
 *   base - the image typed as raw bytes, for adding byte offsets
 * The loader stores the mapping address through `base` and reads header
 * fields through `hdr`.
 */
typedef union {
    const Elf64_Ehdr *hdr;
    const uint8_t *base;
} objhdr;

在load_obj函数中，obj已经指向了目标文件开始的位置
关于Elf64_Ehdr结构体的定义：

在这里插入图片描述

Elf64_Half --> 2Byte
Elf64_Word --> 4Byte
Elf64_Addr --> 8Byte
Elf64_Off  --> 8Byte

其中EI_NIDENT为一个常数：16
我们调试程序，得到其执行完load_obj函数之后，obj结构体对应的结果：
在这里插入图片描述
我们利用hexdump工具，查看目标文件obj.o中的数据

Elf64_Ehdr结构体中的e_ident代表了一组魔数，一共16个字节，可以看到和obj.o文件中的数据相同
e_type占两个字节，代表了ELF文件的类型:

从上面调试的结果可以看到，obj.o文件为Relocatable file
e_machine两个字节，代表编译文件的系统架构
e_version四个字节，代表了文件使用Elf的版本
在这里插入图片描述
e_entry八个字节，代表了程序入口的虚拟内存地址
e_phoff八个字节，代表了程序头表相对于文件开头的偏移量
e_shoff八个字节，代表了节头表相对于文件开头的偏移量
e_ehsize两个字节，代表了Elf文件头的大小
e_phentsize两个字节，代表了程序头表中一个数据的大小
e_phnum两个字节，代表了程序头表中数据的个数
e_shentsize两个字节，代表了节头表中一个数据的大小
e_shnum两个字节，代表了节头表中数据的个数
e_shstrndx两个字节，代表节名字符串表(.shstrtab)在节头表中的下标

下面对parse_obj函数中的每一行代码进行分析：

sections = (const Elf64_Shdr *)(obj.base + obj.hdr->e_shoff);

找到节头表在内存中的地址

shstrtab = (const char *)(obj.base + sections[obj.hdr->e_shstrndx].sh_offset);

这里有一个新的数据结构Elf64_Shdr
在这里插入图片描述
首先利用obj.hdr->e_shstrndx在节头表中找到节名字符串表(.shstrtab)对应的表项，然后利用该表项的sh_offset找到节名字符串表在内存中的位置，即最终shstrtab指向节名字符串表在内存中的地址。

const Elf64_Shdr *symtab_hdr = lookup_section(".symtab");
    if (!symtab_hdr) {
        fputs("Failed to find .symtab\n", stderr);
        exit(ENOEXEC);
    }

上述代码找到.symtab在节头表中的表项。这里先假设lookup_section可以正确地找到并返回指定节的信息，后面再对其进行分析。

symbols = (const Elf64_Sym *)(obj.base + symtab_hdr->sh_offset);

读取符号节表的信息

num_symbols = symtab_hdr->sh_size / symtab_hdr->sh_entsize;

得到符号节表中有多少个元数据

const Elf64_Shdr *strtab_hdr = lookup_section(".strtab");
    if (!strtab_hdr) {
        fputs("Failed to find .strtab\n", stderr);
        exit(ENOEXEC);
    }

strtab = (const char *)(obj.base + strtab_hdr->sh_offset);

找到.strtab节，然后将strtab指向.strtab的内存地址

page_size = sysconf(_SC_PAGESIZE);

得到系统页大小

const Elf64_Shdr *text_hdr = lookup_section(".text");
	if (!text_hdr) {
		fputs("Failed to find .text\n", stderr);
		exit(ENOEXEC);
}

找到.text节的信息

	text_runtime_base = mmap(NULL, page_align(text_hdr->sh_size), PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (text_runtime_base == MAP_FAILED) {
        perror("Failed to allocate memory for .text");
        exit(errno);
    }

需要注意，这里传入了MAP_ANONYMOUS标志（按惯例fd参数传入-1），表示该映射不对应任何文件：只分配page_align(text_hdr->sh_size)大小的内存空间，其内容被初始化为0，而不从文件填入数据。

memcpy(text_runtime_base, obj.base + text_hdr->sh_offset, text_hdr->sh_size);

将.text节中的数据写入到分配的内存空间中

    if (mprotect(text_runtime_base, page_align(text_hdr->sh_size), PROT_READ | PROT_EXEC)) {
        perror("Failed to make .text executable");
        exit(errno);
    }

将拷贝到内存中.text的数据变成只读和可执行

下面分析关于lookup_section函数

/*
 * Find a section header by section name.
 *
 * Walks the sections table and compares `name` with the name of each entry.
 * An entry does not hold its name directly: `sh_name` is an offset into the
 * `.shstrtab` string data. Entries whose `sh_size` is 0 are never returned.
 *
 * Returns the first matching entry, or NULL when there is none.
 */
static const Elf64_Shdr *lookup_section(const char *name)
{
    /* the entry count of the sections table is encoded in the ELF header */
    const Elf64_Shdr *const end = sections + obj.hdr->e_shnum;

    for (const Elf64_Shdr *entry = sections; entry != end; entry++) {
        /* empty sections are skipped even when the name matches */
        if (!entry->sh_size)
            continue;

        if (strcmp(name, shstrtab + entry->sh_name) == 0)
            return entry;
    }

    return NULL;
}

就是遍历节头表中的所有表项，找到名称与指定节名称相同（且大小不为0）的表项并返回

下面需要调用obj.o中的函数

/*
 * Resolve `add5` and `add10` in the loaded object, call each one with 42
 * and print the result. Exits with ENOENT when a function cannot be found.
 */
static void execute_funcs(void)
{
    const int arg = 42;

    /* pointer to the imported add5 function */
    int (*add5_fn)(int) = lookup_function("add5");
    if (add5_fn == NULL) {
        fputs("Failed to find add5 function\n", stderr);
        exit(ENOENT);
    }

    puts("Executing add5...");
    int add5_result = add5_fn(arg);
    printf("add5(%d) = %d\n", arg, add5_result);

    /* pointer to the imported add10 function; looked up only after add5 ran */
    int (*add10_fn)(int) = lookup_function("add10");
    if (add10_fn == NULL) {
        fputs("Failed to find add10 function\n", stderr);
        exit(ENOENT);
    }

    puts("Executing add10...");
    int add10_result = add10_fn(arg);
    printf("add10(%d) = %d\n", arg, add10_result);
}

大致流程就是利用函数指针，然后通过lookup_function找到函数，并将其地址进行返回，最后通过函数指针来调用函数。
下面为lookup_function函数：

/*
 * Resolve a function name to the address of its code inside the runtime
 * copy of `.text`.
 *
 * Only symbols of type STT_FUNC are considered. Returns NULL when no
 * function symbol carries the requested name.
 */
static void *lookup_function(const char *name)
{
    for (int idx = 0; idx < num_symbols; idx++) {
        const Elf64_Sym *sym = &symbols[idx];

        /* anything that is not a function symbol is of no interest */
        if (ELF64_ST_TYPE(sym->st_info) != STT_FUNC)
            continue;

        /* `st_name` is an offset into the `.strtab` data, not the name itself */
        if (strcmp(name, strtab + sym->st_name) != 0)
            continue;

        /* `st_value` is taken as the byte offset of the function from the
         * beginning of the `.text` section
         */
        return text_runtime_base + sym->st_value;
    }

    return NULL;
}

函数，全局变量，静态变量的信息都存放在symbol table中，下面是Elf64_Sym的数据结构：
在这里插入图片描述
参数st_name表示，该符号量对应的名称在字符串表中的下标
参数st_info表示，该符号量是函数，还是全局变量，静态变量等
参数st_shndx表示，该符号量在哪一个节中
参数st_value表示，该符号量的值，函数地址，值（取决于info)

因此，要找到我们需要的函数，步骤如下:

找到函数类型的符号量
检查函数名是否相同
找到符合条件的符号量之后，返回其对应的地址

完整的程序代码为：

/* compile and link:
 * $ gcc -o loader loader.c
 */

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* for open(2), fstat(2) */
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>

/* for close(2), fstat(2) */
#include <unistd.h>

/* for mmap(2) */
#include <sys/mman.h>

/* parsing ELF files */
#include <elf.h>

/* for errno */
#include <errno.h>

/*
 * Two views of one pointer to the start of the object file image:
 *   hdr  - the image typed as an ELF file header, for reading header fields
 *   base - the image typed as raw bytes, for adding byte offsets
 * load_obj() stores the mapping address through `base`; parse_obj() reads
 * header fields through `hdr`.
 */
typedef union {
    const Elf64_Ehdr *hdr;
    const uint8_t *base;
} objhdr;

/* start of the mmap'ed obj.o image (set by load_obj) */
static objhdr obj;

/* sections table inside the obj.o image (set by parse_obj) */
static const Elf64_Shdr *sections;
/* contents of `.shstrtab`: string data that each `sh_name` is an offset into */
static const char *shstrtab = NULL;

/* symbols table: the contents of `.symtab` (set by parse_obj) */
static const Elf64_Sym *symbols;
/* number of entries in the symbols table */
static int num_symbols;
/* contents of `.strtab`: string data that each `st_name` is an offset into */
static const char *strtab = NULL;

/* system page size in bytes; assigned in parse_obj() from sysconf(3) */
static uint64_t page_size;

/* runtime base address of the imported code */
static uint8_t *text_runtime_base;

/*
 * Round `n` up to the next multiple of `page_size`.
 * The mask arithmetic is only correct when `page_size` is a power of two.
 */
static inline uint64_t page_align(uint64_t n)
{
    const uint64_t low_bits = page_size - 1;

    return (n + low_bits) & ~low_bits;
}

/*
 * Map the whole of "obj.o" (looked up in the current directory) read-only
 * into memory and store the start address of the mapping in `obj.base`.
 *
 * Any failure is reported with perror() and terminates the process, using
 * the errno value of the failing call as the exit status.
 */
static void load_obj(void)
{
    struct stat sb;

    /* open(2) signals failure with -1; descriptor 0 is a valid result */
    int fd = open("obj.o", O_RDONLY);
    if (fd < 0) {
        int err = errno; /* perror() may overwrite errno */
        perror("Cannot open obj.o");
        exit(err);
    }

    /* we need obj.o size for mmap(2) */
    if (fstat(fd, &sb)) {
        int err = errno;
        perror("Failed to get obj.o info");
        exit(err);
    }

    /* mmap obj.o into memory */
    obj.base = mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
    if (obj.base == MAP_FAILED) {
        int err = errno;
        perror("Maping obj.o failed");
        exit(err);
    }

    /* per mmap(2), closing the descriptor does not unmap the region */
    close(fd);
}

/*
 * Find a section header by section name.
 *
 * Walks the sections table and compares `name` with the name of each entry.
 * An entry does not hold its name directly: `sh_name` is an offset into the
 * `.shstrtab` string data. Entries whose `sh_size` is 0 are never returned.
 *
 * Returns the first matching entry, or NULL when there is none.
 */
static const Elf64_Shdr *lookup_section(const char *name)
{
    /* the entry count of the sections table is encoded in the ELF header */
    const Elf64_Shdr *const end = sections + obj.hdr->e_shnum;

    for (const Elf64_Shdr *entry = sections; entry != end; entry++) {
        /* empty sections are skipped even when the name matches */
        if (!entry->sh_size)
            continue;

        if (strcmp(name, shstrtab + entry->sh_name) == 0)
            return entry;
    }

    return NULL;
}

/*
 * Resolve a function name to the address of its code inside the runtime
 * copy of `.text`.
 *
 * Only symbols of type STT_FUNC are considered. Returns NULL when no
 * function symbol carries the requested name.
 */
static void *lookup_function(const char *name)
{
    for (int idx = 0; idx < num_symbols; idx++) {
        const Elf64_Sym *sym = &symbols[idx];

        /* anything that is not a function symbol is of no interest */
        if (ELF64_ST_TYPE(sym->st_info) != STT_FUNC)
            continue;

        /* `st_name` is an offset into the `.strtab` data, not the name itself */
        if (strcmp(name, strtab + sym->st_name) != 0)
            continue;

        /* `st_value` is taken as the byte offset of the function from the
         * beginning of the `.text` section
         */
        return text_runtime_base + sym->st_value;
    }

    return NULL;
}

static void parse_obj(void)
{
    /* the sections table offset is encoded in the ELF header */
    sections = (const Elf64_Shdr *)(obj.base + obj.hdr->e_shoff);
    /* the index of `.shstrtab` in the sections table is encoded in the ELF header
     * so we can find it without actually using a name lookup
     */
    shstrtab = (const char *)(obj.base + sections[obj.hdr->e_shstrndx].sh_offset);

    /* find the `.symtab` entry in the sections table */
    const Elf64_Shdr *symtab_hdr = lookup_section(".symtab");
    if (!symtab_hdr) {
        fputs("Failed to find .symtab\n", stderr);
        exit(ENOEXEC);
    }

    /* the symbols table */
    symbols = (const Elf64_Sym *)(obj.base + symtab_hdr->sh_offset);
    /* number of entries in the symbols table = table size / entry size */
    num_symbols = symtab_hdr->sh_size / symtab_hdr->sh_entsize;

    const Elf64_Shdr *strtab_hdr = lookup_section(".strtab");
    if (!strtab_hdr) {
        fputs("Failed to find .strtab\n", stderr);
        exit(ENOEXEC);
    }

    strtab = (const char *)(obj.base + strtab_hdr->sh_offset);

    /* get system page size */
    page_size = sysconf(_SC_PAGESIZE);

    /* find the `.text` entry in the sections table */
    const Elf64_Shdr *text_hdr = lookup_section(".text");
    if (!text_hdr) {
        fputs("Failed to find .text\n", stderr);
        exit(ENOEXEC);
    }

    /* allocate memory for `.text` copy rounding it up to whole pages */
    text_runtime_base = mmap(NULL, page_align(text_hdr->sh_size), PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (text_runtime_base == MAP_FAILED) {
        perror("Failed to allocate memory for .text");
        exit(errno);
    }

    /* copy the contents of `.text` section from the ELF file */
    memcpy(text_runtime_base, obj.base + text_hdr->sh_offset, text_hdr->sh_size);

    /* make the `.text` copy readonly and executable */
    if (mprotect(text_runtime_base, page_align(text_hdr->sh_size), PROT_READ | PROT_EXEC)) {
        perror("Failed to make .text executable");
        exit(errno);
    }
}

/*
 * Resolve `add5` and `add10` in the loaded object, call each one with 42
 * and print the result. Exits with ENOENT when a function cannot be found.
 */
static void execute_funcs(void)
{
    const int arg = 42;

    /* pointer to the imported add5 function */
    int (*add5_fn)(int) = lookup_function("add5");
    if (add5_fn == NULL) {
        fputs("Failed to find add5 function\n", stderr);
        exit(ENOENT);
    }

    puts("Executing add5...");
    int add5_result = add5_fn(arg);
    printf("add5(%d) = %d\n", arg, add5_result);

    /* pointer to the imported add10 function; looked up only after add5 ran */
    int (*add10_fn)(int) = lookup_function("add10");
    if (add10_fn == NULL) {
        fputs("Failed to find add10 function\n", stderr);
        exit(ENOENT);
    }

    puts("Executing add10...");
    int add10_result = add10_fn(arg);
    printf("add10(%d) = %d\n", arg, add10_result);
}

/*
 * Program entry point: map obj.o into memory, parse its ELF structures and
 * run the functions imported from it, in that order.
 */
int main(void)
{
    load_obj();
    parse_obj();
    execute_funcs();

    return EXIT_SUCCESS;
}