深入理解Linux内核网络（一）：内核接收数据包的过程

在应用层执行read调用后就能很方便地接收到来自网络的另一端发送过来的数据，其实在这一行代码下隐藏着非常多的内核组件细节工作。在本节中，将详细讲解数据包如何从内核到应用层，以intel igb网卡为例。

部分内容来源于《深入理解Linux网络》、《Linux内核源码分析TCP实现》

网络收包总览

Linux内核以及网卡驱动主要实现链路层、网络层和传输层这三层上的功能，内核为更上面的应用层提供socket连接来支持用户进程访问。分层图如下：

在这里插入图片描述

当网络设备有数据到达时，它会通过中断信号通知CPU。这种通知是通过电压变化实现的，目的是让CPU立即处理数据。如果在中断处理函数中完成所有的工作，会导致CPU长时间被占用，从而无法响应其他重要的设备输入（如鼠标和键盘）。这会影响系统的整体响应能力。

为了解决上述问题，Linux将中断处理分为“上半部”和“下半部”：

上半部：负责进行快速、简单的工作，比如确认中断的发生，读取基本数据等。这个部分的目标是尽快释放CPU，以便可以处理其他中断。
下半部：将大部分复杂的处理工作放在这里，这样可以在更低的优先级下、在CPU空闲时进行处理。

软中断：在Linux 2.4版本及以后的版本中，下半部的处理主要通过软中断实现。软中断不依赖于物理电压变化，而是通过内存中的变量来标记是否有软中断需要处理。ksoftirqd是一个内核线程，专门负责处理这些软中断。

在这里插入图片描述

Linux启动

创建ksoftirqd内核线程

Linux系统中软中断在ksoftirqd内核线程中处理，Linux会创建和CPU核数相等的ksoftirqd线程。

在这里插入图片描述

/*
 * Descriptor for the per-CPU ksoftirqd threads: the threads are named
 * "ksoftirqd/%u", ksoftirqd_should_run decides whether there is work, and
 * run_ksoftirqd is the thread body.
 */
static struct smp_hotplug_thread softirq_threads = {
    .store = &ksoftirqd,
    .thread_should_run = ksoftirqd_should_run,
    .thread_fn = run_ksoftirqd,
    .thread_comm = "ksoftirqd/%u",
};
/*
 * Early initcall that registers the per-CPU ksoftirqd threads described by
 * softirq_threads. Registration failure is treated as fatal (BUG_ON).
 * Always returns 0.
 *
 * Note: the section annotation is __init (two underscores), matching the
 * other init functions in this article.
 */
static __init int spawn_ksoftirqd(void) {
    register_cpu_notifier(&cpu_nfb);
    BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
    return 0;
}
early_initcall(spawn_ksoftirqd);

网络子系统初始化

网络子系统初始化会为每个CPU初始化softnet_data，也会为NET_RX_SOFTIRQ（接收软中断）和NET_TX_SOFTIRQ（发送软中断）注册处理函数。

在这里插入图片描述

Linux通过调用subsys_initcall来初始化子系统。subsys_initcall(net_dev_init); net_dev_init是初始化网络子系统的函数。

//file: net/core/dev.c
/*
 * Network subsystem initialisation (abbreviated excerpt). Sets up the
 * softnet_data of every possible CPU and registers the handlers for the
 * NET_TX and NET_RX softirqs.
 */
static int __init net_dev_init(void) {
    // Initialise softnet_data for every possible CPU
    for_each_possible_cpu(i) {
    struct work_struct *flush = per_cpu_ptr(&flush_works, i);
    struct softnet_data *sd = &per_cpu(softnet_data, i);

    INIT_WORK(flush, flush_backlog);

    skb_queue_head_init(&sd->input_pkt_queue);
    skb_queue_head_init(&sd->process_queue);
    ......
    }
    ......
    // Register the TX and RX softirq handlers
    open_softirq(NET_TX_SOFTIRQ, net_tx_action);
    open_softirq(NET_RX_SOFTIRQ, net_rx_action);
}

softnet_data数据结构中的poll_list等待驱动程序将其poll函数注册进来。open_softirq函数用于为每种软中断类型注册一个处理函数。将特定软中断编号（如NET_TX_SOFTIRQ和NET_RX_SOFTIRQ）与其对应的处理函数（如net_tx_action和net_rx_action）关联起来。这些处理函数负责处理发送（TX）和接收（RX）网络数据包的逻辑。

通过softirq_vec变量注册软中断处理函数，ksoftirqd线程通过该变量查找中断处理函数处理对应软中断。

// file: kernel/softirq.c
/*
 * Register `action` as the handler for softirq number `nr` by storing it in
 * the softirq_vec table; __do_softirq later dispatches through that table.
 */
void open_softirq(int nr, void (*action)(struct softirq_action *)) {
    softirq_vec[nr].action = action;
}

协议栈注册

内核实现了ip_rcv，tcp_rcv，udp_rcv这些网络协议处理函数，由内核进行注册使用。

在这里插入图片描述

//file: net/ipv4/af_inet.c

/*
 * Link-layer protocol entry for IPv4: frames whose type is ETH_P_IP are
 * handed to ip_rcv (ip_list_rcv is the list variant).
 */
static struct packet_type ip_packet_type __read_mostly = {
	.type = cpu_to_be16(ETH_P_IP),
	.func = ip_rcv,
	.list_func = ip_list_rcv,
};

/* thinking of making this const? Don't.
 * early_demux can change based on sysctl.
 *
 * Transport-layer entry for TCP: tcp_v4_rcv is the receive handler and
 * tcp_v4_err the error handler. netns_ok must be set or
 * inet_add_protocol() refuses the registration.
 */
static struct net_protocol tcp_protocol = {
	.early_demux	=	tcp_v4_early_demux,
	.early_demux_handler =  tcp_v4_early_demux,
	.handler	=	tcp_v4_rcv,
	.err_handler	=	tcp_v4_err,
	.no_policy	=	1,
	.netns_ok	=	1,
	.icmp_strict_tag_validation = 1,
};

/* thinking of making this const? Don't.
 * early_demux can change based on sysctl.
 *
 * Transport-layer entry for UDP: udp_rcv is the receive handler and
 * udp_err the error handler.
 */
static struct net_protocol udp_protocol = {
	.early_demux =	udp_v4_early_demux,
	.early_demux_handler =	udp_v4_early_demux,
	.handler =	udp_rcv,
	.err_handler =	udp_err,
	.no_policy =	1,
	.netns_ok =	1,
};

/*
 * Entry for ICMP: icmp_rcv is the receive handler and icmp_err the error
 * handler. Unlike the TCP/UDP entries this one is const.
 */
static const struct net_protocol icmp_protocol = {
	.handler =	icmp_rcv,
	.err_handler =	icmp_err,
	.no_policy =	1,
	.netns_ok =	1,
};
//file: net/ipv4/af_inet.c
/*
 * IPv4 stack initialisation (abbreviated excerpt). Registers the ICMP, UDP
 * and TCP handlers, plus IGMP when CONFIG_IP_MULTICAST is enabled, through
 * inet_add_protocol(). A failed registration is only logged with pr_crit.
 */
static int __init inet_init(void) {
	// Register the protocols that sit on top of the network layer: TCP, UDP, ICMP
	if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
		pr_crit("%s: Cannot add ICMP protocol\n", __func__);
	if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
		pr_crit("%s: Cannot add UDP protocol\n", __func__);
	if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
		pr_crit("%s: Cannot add TCP protocol\n", __func__);
#ifdef CONFIG_IP_MULTICAST
	if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
		pr_crit("%s: Cannot add IGMP protocol\n", __func__);
#endif
}

从上面的代码中可以看到，udp_protocol结构体中的handler是udp_rcv，tcp_protocol结构体中的handler是tcp_v4_rcv，它们通过inet_add_protocol函数被初始化进来。

// file: net/ipv4/protocol.c
/*
 * Install `prot` as the handler for IP protocol number `protocol` in the
 * inet_protos table.
 *
 * Returns -EINVAL when the protocol is not flagged netns_ok, -1 when the
 * slot was not NULL (already taken), and 0 on success.
 */
int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
{
	if (!prot->netns_ok) {
		pr_err("Protocol %u is not namespace aware, cannot register.\n",
			protocol);
		return -EINVAL;
	}

	/* Only claim the slot if it is currently empty. */
	return !cmpxchg((const struct net_protocol **)&inet_protos[protocol],
			NULL, prot) ? 0 : -1;
}

// Exported symbol so that other modules can use it.
// Once a protocol is registered, other code can look up that protocol's
// handler through this table (indexed by IP protocol number).
struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
EXPORT_SYMBOL(inet_protos);

inet_add_protocol函数用于将TCP和UDP协议对应的处理函数注册到inet_protos数组中，从而使内核能够在接收到这些协议的数据包时找到相应的处理逻辑。同时，调用 dev_add_pack(&ip_packet_type) 将ip_packet_type结构体中的协议名称和处理函数 ip_rcv 注册到 ptype_base 哈希表中。

/*
 * Register a packet_type handler (abbreviated excerpt). The list it is added
 * to is chosen by ptype_head(); the insertion itself is elided here.
 */
void dev_add_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);
	......
}

/*
 * Pick the list a packet_type belongs on.
 *
 * ETH_P_ALL handlers go on a ptype_all list: the device's own list when
 * pt->dev is set, otherwise the global one. Every other type goes on the
 * device's ptype_specific list when pt->dev is set, otherwise into the
 * global ptype_base bucket selected by the type masked with PTYPE_HASH_MASK.
 */
static inline struct list_head *ptype_head(const struct packet_type *pt)
{
	if (pt->type == htons(ETH_P_ALL))
		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
	else
		return pt->dev ? &pt->dev->ptype_specific :
				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}

/* Global hash table of packet_type lists, indexed as shown in ptype_head(). */
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;

这样inet_protos记录着UDP，TCP协议处理函数地址，ptype_base记录ip协议处理函数地址。

网卡驱动初始化

驱动程序会使用module_init向内核注册一个函数，驱动程序被加载时，内核会调用这个函数。以igb网卡为例：

在这里插入图片描述

// drivers/net/ethernet/intel/igb/igb_main.c
/*
 * PCI driver descriptor for igb (abbreviated excerpt). It is handed to
 * pci_register_driver() by igb_init_module(); .probe (igb_probe) is what the
 * kernel calls once a matching NIC has been recognised.
 */
static struct pci_driver igb_driver = {
	.name     = igb_driver_name,
	.id_table = igb_pci_tbl,
	.probe    = igb_probe,
	.remove   = igb_remove,
	...
	.sriov_configure = igb_pci_sriov_configure
};

/*
 * Module entry point registered with module_init (abbreviated excerpt).
 * Registers igb_driver with the PCI core and returns the result of
 * pci_register_driver().
 */
static int __init igb_init_module(void)
{
	...
	ret = pci_register_driver(&igb_driver);
	return ret;
}
module_init(igb_init_module);

驱动调用pci_register_driver后Linux内核可以获取该驱动相关信息，比如name和probe。网卡被识别后，内核会调用驱动提供的probe函数(在这里即igb_probe)。驱动probe函数执行的目的是让设备处于ready状态。

probe函数主要作用：

获取网卡MAC地址。
DMA初始化。
注册ethtool实现函数。
注册net_device_ops,netdev等变量。
初始化NAPI，注册poll函数到napi数据结构。

// file: drivers/net/ethernet/intel/igb/igb_main.c
/*
 * net_device callbacks implemented by igb (abbreviated excerpt). igb_probe
 * installs this table as netdev->netdev_ops; .ndo_open (igb_open) is the
 * callback used when the interface is brought up.
 */
static const struct net_device_ops igb_netdev_ops = {
    .ndo_open            = igb_open,
    .ndo_stop            = igb_close,
    .ndo_start_xmit      = igb_xmit_frame,
    .ndo_get_stats64     = igb_get_stats64,
    .ndo_set_rx_mode     = igb_set_rx_mode,
    .ndo_set_mac_address = igb_set_mac,
    .ndo_change_mtu      = igb_change_mtu,
    .ndo_do_ioctl        = igb_ioctl,
    // ...
};

也就是实现了图中的4-7步。在第5步中，网卡驱动实现了ethtool所需要的接口，也在这里完成函数地址的注册。当ethtool发起一个系统调用之后，内核会找到对应操作的回调函数。也就是ethtool 命令最后调用的都是网卡驱动函数。

NAPI（New API）是 Linux 内核中的网络设备处理机制，通过结合中断驱动和轮询模式来处理数据包。它在接收到数据包时首先触发中断，随后禁用中断并通过软中断轮询处理多个数据包，减少中断频率和 CPU 开销，提高处理效率。

/*
 * PCI probe callback for igb (heavily abbreviated excerpt; declarations,
 * the err_dma label and the return path are elided). The visible steps are:
 * DMA mask setup, MAC address retrieval, installing netdev_ops and the
 * ethtool ops, and igb_sw_init().
 */
static int igb_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
{
	// Configure DMA: try a 64-bit mask first, then fall back to 32-bit
	pci_using_dac = 0;
	err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
	if (!err) {
		pci_using_dac = 1;
	} else {
		err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
		if (err) {
			dev_err(&pdev->dev,
				"No usable DMA configuration, aborting\n");
			goto err_dma;
		}
	}
	// Obtain the MAC address
	if (eth_platform_get_mac_address(&pdev->dev, hw->mac.addr)) {
		/* copy the MAC address out of the NVM */
		if (hw->mac.ops.read_mac_addr(hw))
			dev_err(&pdev->dev, "NVM Read Error\n");
	}

	memcpy(netdev->dev_addr, hw->mac.addr, netdev->addr_len);
	
	// Install netdev_ops and the ethtool implementation.
	// igb_netdev_ops contains igb_open etc., used when the NIC is brought up.
	netdev->netdev_ops = &igb_netdev_ops;
	igb_set_ethtool_ops(netdev);
	
	// According to the walkthrough this ends up calling igb_alloc_q_vector
	// (the call chain is not shown in this excerpt)
	/* setup the private structure */
	err = igb_sw_init(adapter);
}

/*
 * Allocate one queue vector (abbreviated excerpt). The part shown registers
 * igb_poll as the vector's NAPI poll function with a weight of 64.
 */
static int igb_alloc_q_vector(struct igb_adapter *adapter,...)
{
	......
	/* initialize NAPI */
	netif_napi_add(adapter->netdev, &q_vector->napi,
		       igb_poll, 64);      
	......
}

/*
 * Initialise a napi_struct for `dev` (abbreviated excerpt): records the
 * driver's poll callback, its weight and the owning device.
 */
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
		    int (*poll)(struct napi_struct *, int), int weight)
{
		...
		napi->poll = poll;
		napi->weight = weight;
		napi->dev = dev;
		...
}

启动网卡

启用网卡时会调用上面网卡驱动初始化中 net_device_ops提供的open函数(即igb_open)。其包括网卡启用、发包、设置MAC地址等回调函数（函数指针)。当启用一个网卡时（例如，通过ifconfig eth0 up ) ，net_device_ops变量中定义的ndo_open方法会被调用。这是一个函数指针，对于igb网卡来说，该指针指向的是igb_open方法。

在这里插入图片描述

// igb_main.c
/*
 * Bring the interface up (abbreviated excerpt; declarations, error labels
 * and the return path are elided). Visible steps, in order: allocate TX
 * descriptor rings, allocate RX descriptor rings, register the interrupt
 * handler, enable NAPI on every queue vector.
 */
static int __igb_open(struct net_device *netdev, bool resuming)
{
	// Allocate the TX descriptor queues
	/* allocate transmit descriptors */
	err = igb_setup_all_tx_resources(adapter);
	if (err)
		goto err_setup_tx;

	// Allocate the RX descriptor queues
	/* allocate receive descriptors */
	err = igb_setup_all_rx_resources(adapter);
	if (err)
		goto err_setup_rx;
		
	// Register the interrupt handler
	err = igb_request_irq(adapter);
	if (err)
		goto err_req_irq;
	
	// Enable NAPI
	for (i = 0; i < adapter->num_q_vectors; i++)
		napi_enable(&(adapter->q_vector[i]->napi));

}

以上代码中，__igb_open函数调用了igb_setup_all_tx_resources和igb_setup_all_rx_resources。在调用igb_setup_all_rx_resources这一步操作中，分配了RingBuffer，并建立内存和Rx队列的映射关系。(Rx和Tx队列的数量和大小可以通过ethtool进行配置。)

这里以分配RX队列代码为例：

// file: igb_main.c
/*
 * Set up every RX ring of the adapter.
 *
 * Returns 0 on success. If setting up ring i fails, the rings that were
 * already set up (i-1 down to 0) are freed again and the error from
 * igb_setup_rx_resources() is returned.
 */
static int igb_setup_all_rx_resources(struct igb_adapter *adapter)
{
	int i, err = 0;
	
    // Set up each of the (possibly multiple) RX queues
	for (i = 0; i < adapter->num_rx_queues; i++) {
		err = igb_setup_rx_resources(adapter->rx_ring[i]);
		if (err) {
			/* unwind: release the rings set up before the failure */
			for (i--; i >= 0; i--)
				igb_free_rx_resources(adapter->rx_ring[i]);
			break;
		}
	}
	return err;
}

在上面的源码中，通过循环创建了若干个接收队列，下面看一下每个接收队列是怎么创建出来的：

// Set up a single RX queue (abbreviated excerpt; the err label and return
// path are elided). Two arrays are allocated per ring: the igb_rx_buffer
// array via vmalloc and the e1000_adv_rx_desc descriptor array via
// dma_alloc_coherent.
int igb_setup_rx_resources(struct igb_ring *rx_ring)
{
	struct device *dev = rx_ring->dev;
	int size;
	
    // 1. Allocate memory for the igb_rx_buffer array
	size = sizeof(struct igb_rx_buffer) * rx_ring->count;
	rx_ring->rx_buffer_info = vmalloc(size);
	if (!rx_ring->rx_buffer_info)
		goto err;
	
    // 2. Allocate the DMA descriptor array used by the NIC
	/* Round up to nearest 4K */
	rx_ring->size = rx_ring->count * sizeof(union e1000_adv_rx_desc);
	rx_ring->size = ALIGN(rx_ring->size, 4096);

	rx_ring->desc = dma_alloc_coherent(dev, rx_ring->size,
					   &rx_ring->dma, GFP_KERNEL);
	if (!rx_ring->desc)
		goto err;
	
    // 3. Initialise the ring indices
	rx_ring->next_to_alloc = 0;
	rx_ring->next_to_clean = 0;
	rx_ring->next_to_use = 0;
	......
}

从上述源码可以看到，实际上一个RingBuffer的内部不是仅有一个环形队列数组，而是有两个，如图2.9所示。

igb_rx_buffer数组：这个数组是内核使用的，通过vmalloc申请的。
e1000_adv_rx_desc数组：这个数组是网卡硬件使用的，通过dma_alloc_coherent分配。

在这里插入图片描述
接下来看中断处理函数注册部分，会先检查是否支持MSIX中断，如果不支持或设置失败则使用MSI中断。MSIX情况下注册的硬中断处理函数为igb_msix_ring。

MSI（Message Signaled Interrupts）和 MSI-X 是用于替代传统中断（如线性中断）的机制。MSI 通过写入特定内存地址来触发中断，减少了对物理线路的依赖，提供更高效的中断处理。MSI-X 是 MSI 的扩展版本，允许设备配置多个中断向量，从而支持更复杂的多队列和多核处理，进一步提升性能和灵活性。

/*
 * Register the adapter's interrupt handler(s) (abbreviated excerpt).
 * When the adapter has the MSI-X flag, MSI-X is tried first; on success the
 * function returns immediately, otherwise it falls back to the MSI path
 * (elided here).
 */
static int igb_request_irq(struct igb_adapter *adapter)
{
	struct net_device *netdev = adapter->netdev;
	struct pci_dev *pdev = adapter->pdev;
	int err = 0;

	if (adapter->flags & IGB_FLAG_HAS_MSIX) {
		err = igb_request_msix(adapter);
		if (!err)
			goto request_done;
		/* fall back to MSI */
		...
	}
	// MSI
	...
request_done:
	return err;
}

/*
 * Request one MSI-X interrupt per queue vector (abbreviated excerpt;
 * declarations, error handling and the return are elided). Each vector is
 * registered with igb_msix_ring as its handler and the q_vector as the
 * handler's data pointer.
 */
static int igb_request_msix(struct igb_adapter *adapter)
{
    // Register an interrupt for every queue vector
    for (i = 0; i < adapter->num_q_vectors; i++) {
        struct igb_q_vector *q_vector = adapter->q_vector[i];

        vector++;
        q_vector->itr_register = adapter->io_addr + E1000_EITR(vector);
        err = request_irq(adapter->msix_entries[vector].vector,
                    igb_msix_ring, 0, q_vector->name,
                    q_vector);
    }
}

可以看到MSI-X方式下可以为网卡每个接收队列都注册中断，从而可以在网卡中断层面设置让收到的包由不同CPU处理，即修改中断的CPU亲和性，指定中断由特定CPU集处理。

小结

Linux启动中涉及网络的大致过程如下：

创建了ksoftirqd内核线程来处理软中断
初始化网络子系统为每个cpu初始化收发包使用数据结构softnet_data，并且注册RX,TX软中断处理函数
协议栈注册将ip协议处理函数注册到ptype_base数据结构中，tcp,udp协议处理函数注册到inet_protos数据结构中
网卡驱动初始化使网卡ready，注册了ethtool实现函数，初始化NAPI
启动网卡则分配RX和TX队列内存，注册硬中断处理函数。可以接收数据包了

接收数据

硬中断处理

数据帧到达网卡，网卡将数据帧DMA到分配给它的内存中，发起硬中断通知CPU数据包到达。

当RingBuffer满的时候,新来的数据包将被丢弃。使用ifconfig命令查看网卡的时候,可以看到里面有个overruns，表示因为环形队列满被丢弃的包数。如果发现有丢包，可能需要通过ethtool命令来加大环形队列的长度。

在这里插入图片描述

/*
 * Hard-IRQ handler for one MSI-X queue vector. `data` is the igb_q_vector
 * passed to request_irq(). It writes the ITR value and schedules the
 * vector's NAPI instance; the actual packet processing happens later in
 * softirq context. Always returns IRQ_HANDLED.
 */
static irqreturn_t igb_msix_ring(int irq, void *data)
{
	struct igb_q_vector *q_vector = data;
	/* Write the ITR value calculated from the previous interrupt. */
	igb_write_itr(q_vector);
	napi_schedule(&q_vector->napi);
	return IRQ_HANDLED;
}

igb_write_itr记录硬件中断频率，追踪napi_schedule调用可以发现会调用____napi_schedule(this_cpu_ptr(&softnet_data), n)。将驱动传来的napi_struct通过其poll_list成员添加到当前CPU变量softnet_data的poll_list中。

// file: net/core/dev.c

/*
 * Queue `napi` at the tail of the given softnet_data's poll_list and mark
 * NET_RX_SOFTIRQ as pending.
 */
static inline void ____napi_schedule(struct softnet_data *sd,
				     struct napi_struct *napi)
{
	list_add_tail(&napi->poll_list, &sd->poll_list);
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}

__raise_softirq_irqoff 触发了一个软中断NET_RX_SOFTIRQ，这个所谓的触发过程只是对一个变量进行了一次或运算而已。

// file: kernel/softirq.c
/*
 * Mark softirq `nr` as pending: after the trace hook this just ORs bit
 * (1UL << nr) into the pending mask.
 */
void __raise_softirq_irqoff(unsigned int nr)
{
	trace_softirq_raise(nr);
	or_softirq_pending(1UL << nr);
}

// file: include/linux/interrupt.h
// Set softirq-pending bits for the current CPU (ORs x into local_softirq_pending_ref)
#define or_softirq_pending(x)	(__this_cpu_or(local_softirq_pending_ref, (x)))

Linux在硬中断里只完成简单必要的工作，剩下的大部分的处理都是转交给软中断的。通过以上代码可以看到，硬中断处理过程非常短，只是记录了一个寄存器，修改了一下CPU的poll_list，然后发出一个软中断。

ksoftirqd内核线程处理软中断

网络包的接收处理过程主要都在ksoftirqd内核线程中完成，软中断都是在这里处理的：

在这里插入图片描述检测软中断标记时使用ksoftirqd_should_run函数。

// file: kernel/softirq.c
/*
 * thread_should_run callback of softirq_threads: returns the current CPU's
 * softirq-pending mask, i.e. non-zero when a softirq is waiting.
 */
static int ksoftirqd_should_run(unsigned int cpu)
{
	return local_softirq_pending();
}

// Read the current CPU's softirq-pending mask
#define local_softirq_pending()	(__this_cpu_read(local_softirq_pending_ref))

检测到软中断标记，会执行对应处理程序：

// file: kernel/softirq.c

/*
 * thread_fn callback of softirq_threads (the ksoftirqd thread body).
 * With local interrupts disabled it checks the pending mask; if anything is
 * pending it runs __do_softirq(), re-enables interrupts and calls
 * cond_resched(). Interrupts are re-enabled on both paths.
 */
static void run_ksoftirqd(unsigned int cpu)
{
	local_irq_disable();
	if (local_softirq_pending()) {
		/*
		 * We can safely run softirq on inline stack, as we are not deep
		 * in the task stack here.
		 */
		__do_softirq();
		local_irq_enable();
		cond_resched();
		return;
	}
	local_irq_enable();
}

/*
 * Softirq dispatcher (heavily abbreviated excerpt; declarations and the
 * loading of `pending` are elided). Walks the set bits of `pending` from the
 * lowest upwards and invokes the handler stored in softirq_vec for each one,
 * i.e. the function registered through open_softirq().
 */
asmlinkage __visible void __softirq_entry __do_softirq(void)
{
	h = softirq_vec;
	while ((softirq_bit = ffs(pending))) {
		......

		trace_softirq_entry(vec_nr);
		// Run the handler registered for this softirq
		h->action(h);
		trace_softirq_exit(vec_nr);
		
		h++;
		pending >>= softirq_bit;
	}
}

硬中断中注册软中断时是修改当前cpu的相关变量，而内核线程处理软中断时也是通过读取当前cpu相应变量，所以硬中断在哪个cpu上被处理，软中断也会在对应cpu上被处理。

如果发现软中断集中在一个核上，应该考虑通过修改硬中断亲和性将其打散到不同cpu上。

处理RX软中断的函数是在网络子系统初始化时注册的net_rx_action函数,获取当前cpu的softnet_data成员，获取其poll_list进行处理。time_limit（时间限制）,budget（处理数据包数量限制）用来控制主动退出，防止net_rx_action占用cpu过长时间。

// file: net/core/dev.c

/*
 * NET_RX_SOFTIRQ handler (abbreviated excerpt; the empty-list exit and the
 * tail of the function are elided). Takes over the current CPU's poll_list
 * and polls the queued NAPI instances until either the packet budget
 * (netdev_budget) or the time limit (netdev_budget_usecs) is used up, in
 * which case time_squeeze is incremented.
 */
static __latent_entropy void net_rx_action(struct softirq_action *h)
{
	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
	unsigned long time_limit = jiffies +
		usecs_to_jiffies(netdev_budget_usecs);
	int budget = netdev_budget; // upper bound on packets processed in this run
	LIST_HEAD(list);
	LIST_HEAD(repoll);

	/* Move the pending NAPI instances onto a private list with IRQs off. */
	local_irq_disable();
	list_splice_init(&sd->poll_list, &list);
	local_irq_enable();

	for (;;) {
		struct napi_struct *n;

		n = list_first_entry(&list, struct napi_struct, poll_list);
		// napi_poll removes the entry from the list
		budget -= napi_poll(n, &repoll);

		/* If softirq window is exhausted then punt.
		 * Allow this to run for 2 jiffies since which will allow
		 * an average latency of 1.5/HZ.
		 */
		if (unlikely(budget <= 0 ||
			     time_after_eq(jiffies, time_limit))) {
			sd->time_squeeze++;
			break;
		}
	}
	...
}

net_rx_action遍历poll_list并执行napi_poll函数，其中会调用驱动注册到napi数据结构的poll函数(igb_poll函数)。

/*
 * Poll one NAPI instance (abbreviated excerpt; `work`, `weight` and the
 * return are elided). If the instance is still marked NAPI_STATE_SCHED, the
 * poll callback stored by netif_napi_add() is invoked with the weight.
 */
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
    ...
    if (test_bit(NAPI_STATE_SCHED, &n->state)) {
        // Call the poll function registered by the driver
		work = n->poll(n, weight);
		trace_napi_poll(n, work, weight);
	}
    ...
}

igb_poll函数中重点是对igb_clean_rx_irq的调用，读取RX描述符队列，根据RX描述符信息将数据帧从RingBuffer中取下，放入skb并释放对应内存(之后会重新分配)。

/*
 * NAPI poll callback of igb, registered in igb_alloc_q_vector (abbreviated
 * excerpt). Cleans the TX ring if the vector has one, then the RX ring.
 * When the RX clean used up the whole budget, clean_complete is cleared.
 */
static int igb_poll(struct napi_struct *napi, int budget)
{
	......
	if (q_vector->tx.ring)
		clean_complete = igb_clean_tx_irq(q_vector, budget);

	if (q_vector->rx.ring) {
		int cleaned = igb_clean_rx_irq(q_vector, budget);

		work_done += cleaned;
		if (cleaned >= budget)
			clean_complete = false;
	}
	......
}
/*
 * Drain received frames from one RX ring, at most `budget` packets
 * (abbreviated excerpt; declarations, skb construction and the return are
 * elided). A frame may span several descriptors: buffers are appended to the
 * same skb until igb_is_non_eop() reports the end of the frame. Completed
 * skbs are passed to napi_gro_receive(); afterwards the consumed buffers are
 * replenished.
 */
static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
{
	......

    while (likely(total_packets < budget)) {
        union e1000_adv_rx_desc *rx_desc;
        struct igb_rx_buffer *rx_buffer;
        unsigned int size;

        rx_desc = IGB_RX_DESC(rx_ring, rx_ring->next_to_clean);
        size = le16_to_cpu(rx_desc->wb.upper.length);
        rx_buffer = igb_get_rx_buffer(rx_ring, size);

        /* retrieve a buffer from the ring */
        if (skb)
            igb_add_rx_frag(rx_ring, rx_buffer, skb, size);
        ...

        igb_put_rx_buffer(rx_ring, rx_buffer);
        cleaned_count++;

        /* fetch next buffer in frame if non-eop */
        if (igb_is_non_eop(rx_ring, rx_desc))
            continue;

        /* verify the packet layout is correct */
        if (igb_cleanup_headers(rx_ring, rx_desc, skb)) {
            skb = NULL;
            continue;
        }

        /* populate checksum, timestamp, VLAN, and protocol */
        igb_process_skb_fields(rx_ring, rx_desc, skb);

        // The skb is handed to the protocol stack inside this call
        napi_gro_receive(&q_vector->napi, skb);
        ...
    }
    // Re-allocate the buffers that were consumed and put them back on the ring
    if (cleaned_count)
        igb_alloc_rx_buffers(rx_ring, cleaned_count);
}

上文代码中igb_clean_rx_irq函数的while循环中，igb_is_non_eop检测是否到数据帧结尾，数据帧有可能较大占用多个数据块，由多个描述符描述，将同一数据帧放入同一sk_buff。接下来进入napi_gro_receive函数进行GRO处理。

// File: net/core/dev.c
/*
 * GRO entry point used by drivers: resets the skb's GRO offset, runs
 * dev_gro_receive() and passes its verdict to napi_skb_finish(), whose
 * result is returned.
 */
gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) {
    skb_gro_reset_offset(skb);
    return napi_skb_finish(dev_gro_receive(napi, skb), skb);
}

GRO允许将多个小的接收数据包合并为一个较大的数据包。这种合并操作发生在内核网络栈中，可以减少每个数据包的处理开销。

最后在napi_skb_finish中，数据包被送到协议栈。

/*
 * Act on the GRO verdict for `skb` (abbreviated excerpt; the other cases are
 * elided). For GRO_NORMAL the skb is delivered to the protocol stack through
 * netif_receive_skb_internal(); a non-zero result from that call turns the
 * verdict into GRO_DROP. Returns the (possibly updated) verdict.
 */
static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
{
	switch (ret) {
	case GRO_NORMAL:
		if (netif_receive_skb_internal(skb))
			ret = GRO_DROP;
		break;
	......
	}

	return ret;
}

网络协议栈处理

netif_receive_skb函数会根据包的协议进行处理，最终会调用__netif_receive_skb_core函数处理。假如是UDP包，将包依次送到 ip_rcv、udp_rcv等协议处理函数中进行处理。

在这里插入图片描述 tcpdump用到的协议会注册到ptype_all上，在这里会将数据包(sk_buff)传递给所有注册的协议，tcpdump就是这样抓包的。之后交由注册到ptype_base的对应协议处理函数处理。

// file: net/core/dev.c
int netif_receive_skb(struct sk_buff *skb) {
    // RPS处理逻辑，先忽略
    return netif_receive_skb(skb); 
}

static int _netif_receive_skb(struct sk_buff *skb) {
	......
	
    ret = ___netif_receive_skb_core(skb, false); 
    return ret; 
}

/*
 * Core receive dispatch (abbreviated excerpt; `type`, `orig_dev` and other
 * setup are elided).
 *
 * Handlers are delivered one step behind: each time a matching packet_type
 * is found, the PREVIOUS match (pt_prev) is delivered through deliver_skb()
 * and the new match becomes pt_prev. The last match is therefore still
 * undelivered after both loops and is invoked directly at the end. If
 * nothing matched at all, the skb is freed and NET_RX_DROP is returned.
 */
static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) {
	......

    struct packet_type *ptype, *pt_prev = NULL;
    int ret = NET_RX_DROP;

    // Walk every handler registered for all packet types (ptype_all),
    // which is where capture tools such as tcpdump hook in
    list_for_each_entry_rcu(ptype, &ptype_all, list) {
        // Only handlers bound to no device or to this skb's device apply
        if (!ptype->dev || ptype->dev == skb->dev) {
            // Deliver to the previously matched handler, if any
            if (pt_prev) {
                ret = deliver_skb(skb, pt_prev, orig_dev);
            }
            pt_prev = ptype; // remember the current match
        }
    }
	......
    // Walk the protocol handlers in the ptype_base bucket for this packet's type
    list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
        // Both the protocol type and the device must match
        if (ptype->type == type &&
            (!ptype->dev || ptype->dev == skb->dev || ptype->dev == orig_dev)) {
            // Deliver to the previously matched handler, if any
            if (pt_prev) {
                ret = deliver_skb(skb, pt_prev, orig_dev);
            }
            pt_prev = ptype; // remember the current match
        }
    }

    // The final match has not been delivered yet: call it directly
    // (for an IPv4 packet this is where ip_rcv is reached)
    if (pt_prev) {
        ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
    } else {
        kfree_skb(skb);
        ret = NET_RX_DROP;
    }

    return ret;
}

tcpdump是通过虚拟协议的方式工作的，它会将抓包函数以协议的形式挂到ptype_all上。设备层遍历所有的“协议”，这样就能抓到数据包来供我们查看了。tcpdump会执行到packet_create。

// file: net/packet/af_packet.c

/*
 * Creation hook for packet sockets (abbreviated excerpt; `po`, `sk` and the
 * return are elided). Sets the socket's protocol hook to packet_rcv, or to
 * packet_rcv_spkt for SOCK_PACKET sockets, then registers the hook.
 */
static int packet_create(struct net *net, struct socket *sock, ...) {
    po->prot_hook.func = packet_rcv;
    
    if (sock->type == SOCK_PACKET)
        po->prot_hook.func = packet_rcv_spkt;
        
    po->prot_hook.af_packet_priv = sk;
    register_prot_hook(sk);
}

register_prot_hook函数会把tcpdump用到的“协议”挂到ptype_all上。

接着__netif_receive_skb_core函数取出protocol，它会从数据包中取出协议信息，然后遍历注册在这个协议上的回调函数列表。ptype_base是一个哈希表，在前面的“协议栈注册”部分提到过。ip_rcv函数地址就是存在这个哈希表中的。

// file: net/core/dev.c

/*
 * Deliver `skb` to one registered handler (abbreviated excerpt): calls the
 * packet_type's func callback and returns its result. For the IPv4
 * packet_type that callback is ip_rcv.
 */
static inline int deliver_skb(struct sk_buff *skb,
                               struct packet_type *pt_prev,
                               struct net_device *orig_dev)
{
    ......
    
    return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}

在其中会调用到在协议栈注册时的函数，对于ip包来讲会进入到ip_rcv。

IP层处理

ip_rcv函数如下，其中NF_HOOK是钩子函数，是netfilter的过滤点，而iptables就是基于netfilter的，iptables设置的规则就是在这些地方被执行。

// file: net/ipv4/ip_input.c

/*
 * IP receive entry point
 *
 * Abbreviated excerpt: after the elided validation the packet is passed
 * through the netfilter NF_INET_PRE_ROUTING hook, with ip_rcv_finish as the
 * function to continue with afterwards.
 */
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
	   struct net_device *orig_dev)
{
	......
	return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
		       net, NULL, skb, dev, NULL,
		       ip_rcv_finish);
}

执行完钩子函数后会执行最后一个参数提供的函数ip_rcv_finish。其中一般情况最后调用dst_input()函数，将数据包从网络层传输到传输层。

/*
 * Continuation of ip_rcv after the PRE_ROUTING hook (abbreviated excerpt;
 * `iph` and the handling of `err` are elided). If the skb has no dst entry
 * yet, a route lookup is performed; the packet is then passed on through
 * dst_input().
 */
static int ip_rcv_finish(struct sk_buff *skb) {
	......
	
    if (!skb_dst(skb)) {
        int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, iph->tos, skb->dev);
    }
	......
	    
    return dst_input(skb);
}

跟踪ip_route_input_noref后看到它又调用了ip_route_input_mc。在ip_route_input_mc中，函数ip_local_deliver 被赋值给了dst.input。

// File: net/ipv4/route.c

/*
 * Route setup helper (abbreviated excerpt; `rth` and the return are elided).
 * When `our` is set, the route's input handler becomes ip_local_deliver and
 * the route is flagged RTCF_LOCAL.
 */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                              u8 tos, struct net_device *dev, int our) {
	......
    if (our) {
        rth->dst.input = ip_local_deliver;
        rth->rt_flags |= RTCF_LOCAL;
    }
	......
}

然后回到ip_rcv_finish的dst_input，其中调用的input方法就是ip_route_input_mc中赋值的ip_local_deliver

// file: include/net/dst.h
/*
 * Invoke the input handler stored in the skb's dst entry and return its
 * result (ip_local_deliver when the route was set up as shown above).
 */
static inline int dst_input(struct sk_buff *skb) {
    return skb_dst(skb)->input(skb);
}

ip_local_deliver函数将接收到的 IP 数据包传递给本地协议栈的上层处理。inet_protos中保存着tcp_v4_rcv和udp_rcv的函数地址。这里将会根据包中的协议类型选择分发，在这里skb包将会进一步被派送到更上层的协议中，UDP和TCP。

// file: net/ipv4/ip_input.c
/*
 * Deliver an IP packet to the local upper layers. Fragments go through
 * ip_defrag() first; when that returns non-zero the function returns 0
 * without going further. Otherwise the packet passes the netfilter
 * NF_INET_LOCAL_IN hook, with ip_local_deliver_finish as the continuation.
 */
int ip_local_deliver(struct sk_buff *skb) {
    if (ip_is_fragment(ip_hdr(skb))) {
        if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))
            return 0;
    }
    return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, skb, skb->dev, NULL, ip_local_deliver_finish);
}

// file: net/ipv4/ip_input.c
/*
 * Dispatch to the transport layer (abbreviated excerpt; the path for an
 * unregistered protocol is elided). Reads the protocol number from the IP
 * header, looks up the handler registered in inet_protos by
 * inet_add_protocol() (e.g. tcp_v4_rcv, udp_rcv) and calls it.
 */
static int ip_local_deliver_finish(struct sk_buff *skb) {
	......
    int protocol = ip_hdr(skb)->protocol;
    const struct net_protocol *ipprot;

    ipprot = rcu_dereference(inet_protos[protocol]); // select the handler for this protocol
    if (ipprot != NULL) {
        return ipprot->handler(skb);
    }
}