一、用户空间
以Linux以及TPACKET_V3为例。
调用pcap_dispatch获取数据包,然后回调用户传递的数据包处理函数。
read_op实际调用的是pcap_read_linux_mmap_v3
// pcap.c
/* Dispatch up to cnt packets to the user-supplied callback.
 * Thin wrapper: delegates to the handle's read_op, which for
 * Linux TPACKET_V3 is pcap_read_linux_mmap_v3 (see text above). */
int
pcap_dispatch(pcap_t *p, int cnt, pcap_handler callback, u_char *user)
{
return (p->read_op(p, cnt, callback, user));
}
1.1 获取block
1.1.1 根据offset获取一个block
/* h->buffer holds an array of pointers to the mmap'ed ring frames/blocks;
 * index it by ring offset to get one. */
#define RING_GET_FRAME_AT(h, offset) (((union thdr **)h->buffer)[(offset)])
/* Frame/block at the handle's current ring offset. */
#define RING_GET_CURRENT_FRAME(h) RING_GET_FRAME_AT(h, h->offset)
h.raw = RING_GET_CURRENT_FRAME(handle);
1.1.2 判断当前block的状态
根据block_status值判断block的实际状态,主要关注两个值,
TP_STATUS_KERNEL
- block正在内核使用,用户不能使用
TP_STATUS_USER
- block已经由内核填充了数据,用户可以读取,内核不能使用
/* A block whose status is still TP_STATUS_KERNEL is owned by the
 * kernel and must not be read by user space. */
if (h.h3->hdr.bh1.block_status == TP_STATUS_KERNEL) {
...
}
1.2 读取/处理数据
如果当前block的status为TP_STATUS_USER,则开始读取数据。
...
/* Skip the block header: advance to the first packet in the block */
handlep->current_packet = h.raw + h.h3->hdr.bh1.offset_to_first_pkt;
/* Number of packets stored in the current block */
handlep->packets_left = h.h3->hdr.bh1.num_pkts;
while (packets_to_read-- && !handle->break_loop) {
struct tpacket3_hdr* tp3_hdr = (struct tpacket3_hdr*) handlep->current_packet;
ret = pcap_handle_packet_mmap(
handle,
callback,
user,
handlep->current_packet,
tp3_hdr->tp_len,
tp3_hdr->tp_mac,
tp3_hdr->tp_snaplen,
tp3_hdr->tp_sec,
handle->opt.tstamp_precision == PCAP_TSTAMP_PRECISION_NANO ? tp3_hdr->tp_nsec : tp3_hdr->tp_nsec / 1000,
VLAN_VALID(tp3_hdr, &tp3_hdr->hv1),
tp3_hdr->hv1.tp_vlan_tci,
VLAN_TPID(tp3_hdr, &tp3_hdr->hv1));
...
/* Move to the next packet in this block */
handlep->current_packet += tp3_hdr->tp_next_offset;
handlep->packets_left--;
}
...
回调用户处理函数
/* handle a single memory mapped packet */
/* Excerpt: builds a pcap header from the tpacket3 metadata (lengths,
 * timestamp, VLAN info) and invokes the user callback; returns 1 when
 * the packet was handled. */
static int pcap_handle_packet_mmap(
pcap_t *handle,
pcap_handler callback,
u_char *user,
unsigned char *frame,
unsigned int tp_len,
unsigned int tp_mac,
unsigned int tp_snaplen,
unsigned int tp_sec,
unsigned int tp_usec,
int tp_vlan_tci_valid,
__u16 tp_vlan_tci,
__u16 tp_vlan_tpid)
{
...
/* pass the packet to the user */
callback(user, &pcaphdr, bp);
return 1;
}
1.3 "释放"当前block
当前block的数据包处理完成后,需要将当前block归还给内核,让内核可以继续写数据;只需将状态值设置为TP_STATUS_KERNEL即可。
/* All packets in this block consumed: hand the block back to the
 * kernel by resetting its status, then advance the ring offset. */
if (handlep->packets_left <= 0) {
h.h3->hdr.bh1.block_status = TP_STATUS_KERNEL;
...
/* next block (the ring wraps around at handle->cc) */
if (++handle->offset >= handle->cc)
handle->offset = 0;
handlep->current_packet = NULL;
}
1.4 等待数据
因为block是一个循环队列,如果当前block的状态是TP_STATUS_KERNEL,则说明后面都没有数据,只能等待。
libpcap通过poll
进行数据的等待,其中fd则是最开始创建的socket。
/* Current block still owned by the kernel: nothing to read anywhere
 * in the ring, so block until the kernel retires a block. */
if (h.h3->hdr.bh1.block_status == TP_STATUS_KERNEL) {
ret = pcap_wait_for_frames_mmap(handle);
if (ret) {
return ret;
}
}
/* Excerpt: wait in poll(2) on the packet socket until the kernel
 * signals that ring data is ready. */
static int pcap_wait_for_frames_mmap(pcap_t *handle)
{
struct pcap_linux *handlep = handle->priv;
...
struct pollfd pollinfo;
int ret;
pollinfo.fd = handle->fd;
pollinfo.events = POLLIN;
do {
ret = poll(&pollinfo, 1, handlep->poll_timeout);
...
} while (ret < 0); /* retry while poll() reports an error */
1.4.1 阻塞模式
默认是阻塞模式,并且TPACKET_V3下超时为-1(永不超时),当没有流量时,将不会被唤醒。
/* Excerpt: choose the poll() timeout. With a working TPACKET_V3 the
 * timeout is -1 (block forever); the kernel wakes us when a block is
 * retired. */
static void
set_poll_timeout(struct pcap_linux *handlep) {
...
if (handlep->tp_version == TPACKET_V3 && !broken_tpacket_v3)
handlep->poll_timeout = -1; /* block forever, let TPACKET_V3 wake us up */
else
...
}
如何提前唤醒?
在 libpcap-1.10.4之前无法提前唤醒,只能等待数据的到来,在新版本中增加了一个fd,专门用来提前唤醒。
https://github.com/the-tcpdump-group/libpcap/pull/741/commits/5c8b13d3e87542527ed9a3a79fb0f9b2edb74df1
- 在创建handle时,同时创建了poll_breakloop_fd
/* Excerpt: create the Linux capture handle; also creates the eventfd
 * used solely to wake poll() early from pcap_breakloop(). */
pcap_t *
pcap_create_interface(const char *device, char *ebuf)
{
pcap_t *handle;
handle = PCAP_CREATE_COMMON(ebuf, struct pcap_linux);
if (handle == NULL)
return NULL;
...
struct pcap_linux *handlep = handle->priv;
/* non-blocking eventfd; written by pcap_breakloop_linux() */
handlep->poll_breakloop_fd = eventfd(0, EFD_NONBLOCK);
return handle;
}
- 激活handle时设置对应的break_loop callback
/* Excerpt: activation installs the Linux-specific breakloop hook. */
static int
pcap_activate_linux(pcap_t *handle)
...
handle->breakloop_op = pcap_breakloop_linux;
...
}
- poll时将poll_breakloop_fd也监听
/* Excerpt (newer libpcap): poll on two fds so pcap_breakloop() can
 * interrupt an otherwise unbounded wait. */
static int pcap_wait_for_frames_mmap(pcap_t *handle)
{
struct pcap_linux *handlep = handle->priv;
...
/* [0] = the packet socket, [1] = the breakloop eventfd */
struct pollfd pollinfo[2];
int numpollinfo;
pollinfo[0].fd = handle->fd;
pollinfo[0].events = POLLIN;
...
pollinfo[1].fd = handlep->poll_breakloop_fd;
pollinfo[1].events = POLLIN;
numpollinfo = 2;
...
- 调用
pcap_breakloop
通知唤醒
/* Public API: request that the capture loop stop; delegates to the
 * per-platform breakloop_op (pcap_breakloop_linux on Linux). */
void
pcap_breakloop(pcap_t *p)
{
p->breakloop_op(p);
}
/* Linux breakloop: set the common break flag, then write to the
 * eventfd so a thread blocked in poll() is woken immediately. */
static void pcap_breakloop_linux(pcap_t *handle)
{
pcap_breakloop_common(handle);
struct pcap_linux *handlep = handle->priv;
uint64_t value = 1;
/* XXX - what if this fails? */
if (handlep->poll_breakloop_fd != -1)
(void)write(handlep->poll_breakloop_fd, &value, sizeof(value));
}
- poll被poll_breakloop_fd唤醒
/* The breakloop eventfd became readable: drain its counter, then
 * honor the break_loop flag by returning PCAP_ERROR_BREAK. */
if (pollinfo[1].revents & POLLIN) {
ssize_t nread;
uint64_t value;
nread = read(handlep->poll_breakloop_fd, &value,
sizeof(value));
...
if (handle->break_loop) {
handle->break_loop = 0;
return PCAP_ERROR_BREAK;
}
}
1.4.2 非阻塞模式
没有数据时,立刻返回,通过如下API进行设置
int
pcap_setnonblock(pcap_t *p, int nonblock, char *errbuf)
二、内核空间
内核在接收到数据包时,将调用相应的处理函数进行处理
__netif_receive_skb_core()
deliver_skb()
/* Excerpt: hand the skb to a registered protocol handler; for a
 * packet socket with an rx ring, pt_prev->func is tpacket_rcv
 * (see text below). */
static inline int deliver_skb(struct sk_buff *skb,
struct packet_type *pt_prev,
struct net_device *orig_dev)
{
...
return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}
而pt_prev->func实际为在设置rx ring时设置的函数tpacket_rcv
2.1 判断是否有可用空间
如果没有空间了,则当前数据包被丢弃。
/* If we are flooded, just give up */
/* No room in the ring: count the drop and discard this packet. */
if (__packet_rcv_has_room(po, skb) == ROOM_NONE) {
atomic_inc(&po->tp_drops);
goto drop_n_restore;
}
2.2 获取一个可用的block
h.raw = packet_current_rx_frame(po, skb,
TP_STATUS_KERNEL, (macoff+snaplen));
2.3 拷贝数据到block中
skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
2.4 更新block属性
switch (po->tp_version) {
...
case TPACKET_V3:
/* Publish per-packet metadata into the ring header. */
h.h3->tp_status |= status;
h.h3->tp_len = skb->len; /* full on-wire length */
h.h3->tp_snaplen = snaplen; /* bytes actually copied */
h.h3->tp_mac = macoff; /* offset of the MAC header */
h.h3->tp_net = netoff; /* offset of the network header */
h.h3->tp_sec = ts.tv_sec;
h.h3->tp_nsec = ts.tv_nsec;
memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
hdrlen = sizeof(*h.h3);
break;
default:
BUG();
}
2.5 何时更新block状态
1. 当block写满时
在每次获取block时,将判断当前block是否有足够的空间写入当前的数据包
/* Excerpt: locate space in the rx ring for an incoming packet;
 * for TPACKET_V3 this delegates to __packet_lookup_frame_in_block. */
static void *packet_current_rx_frame(struct packet_sock *po,
struct sk_buff *skb,
int status, unsigned int len)
{
char *curr = NULL;
switch (po->tp_version) {
...
case TPACKET_V3:
return __packet_lookup_frame_in_block(po, skb, len);
...
}
}
/* Excerpt: find room for `len` bytes in the currently active block. */
static void *__packet_lookup_frame_in_block(struct packet_sock *po,
struct sk_buff *skb,
unsigned int len
)
{
...
/* Enough room in the current block: fill it in and return it. */
if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
prb_fill_curr_block(curr, pkc, pbd, len);
return (void *)curr;
}
空间不足时,将当前block 关闭(即将状态设置为TP_STATUS_USER
),并通知socket fd有数据, 用户空间的poll则会被唤醒。
/* Ok, close the current block */
prb_retire_current_block(pkc, po, 0);
/* Excerpt: retire the active block; delegates to prb_close_block(). */
static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
struct packet_sock *po, unsigned int status)
{
...
prb_close_block(pkc, pbd, po, status);
...
}
/* Excerpt: hand the block over to user space and advance the ring. */
static void prb_close_block(struct tpacket_kbdq_core *pkc1,
struct tpacket_block_desc *pbd1,
struct packet_sock *po, unsigned int stat)
{
/* Mark the block readable by user space, plus any extra flags. */
__u32 status = TP_STATUS_USER | stat;
...
/* Flush the block */
prb_flush_block(pkc1, pbd1, status);
/* Notify the socket that data is ready; a blocked user-space poll() wakes up */
sk->sk_data_ready(sk);
/* Make the next ring block the active one */
pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
}
/* Excerpt: publish the final status word of a retired block. */
static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
struct tpacket_block_desc *pbd1, __u32 status)
{
/* Now update the block status. */
BLOCK_STATUS(pbd1) = status;
}
2. 当block超时时
当流量很小时,block一直都不会被写满,因此数据一直停留在block中,上层应用无法获取数据;因此增加了一个timer.
- 在建立ring buf时初始化timer并设置超时处理函数
/* Excerpt: rx-ring setup also arms the block-retire timer. */
static void init_prb_bdqc(struct packet_sock *po,
struct packet_ring_buffer *rb,
struct pgv *pg_vec,
union tpacket_req_u *req_u)
{
...
prb_setup_retire_blk_timer(po);
...
}
/* Install the retire timer on the rx ring; on expiry it runs
 * prb_retire_rx_blk_timer_expired(). */
static void prb_setup_retire_blk_timer(struct packet_sock *po)
{
struct tpacket_kbdq_core *pkc;
pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
0);
pkc->retire_blk_timer.expires = jiffies;
}
- timer超时,调用回调函数,关闭当前block
/* Excerpt: timer callback. If the active block number has not changed
 * since the last timer run (block presumably stalled with unread
 * data), retire it with the timeout status bit set so user space can
 * read the partially-filled block. */
static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
{
...
if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
...
prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
...
}
...
}