一、用户空间
以Linux以及TPACKET_V3为例。
调用pcap_dispatch获取数据包,然后回调用户传递的数据包处理函数。
read_op实际调用的是pcap_read_linux_mmap_v3
// pcap.c
/* Dispatch up to cnt packets to the user-supplied callback.
 * Thin wrapper: delegates to the handle's read_op, which for
 * Linux TPACKET_V3 is pcap_read_linux_mmap_v3 (see text above). */
int
pcap_dispatch(pcap_t *p, int cnt, pcap_handler callback, u_char *user)
{
return (p->read_op(p, cnt, callback, user));
}
1.1 获取block
1.1.1 根据offset获取一个block
/* h->buffer holds an array of pointers to the mmap'ed ring frames/blocks;
 * index it by ring offset to get one. */
#define RING_GET_FRAME_AT(h, offset) (((union thdr **)h->buffer)[(offset)])
/* Frame/block at the handle's current ring offset. */
#define RING_GET_CURRENT_FRAME(h) RING_GET_FRAME_AT(h, h->offset)
h.raw = RING_GET_CURRENT_FRAME(handle);
1.1.2 判断当前block的状态
根据block_status值判断block的实际状态,主要关注两个值,
TP_STATUS_KERNEL
- block正在内核使用,用户不能使用
TP_STATUS_USER
- block已经由内核填充了数据,用户可以读取,内核不能使用
/* A block whose status is still TP_STATUS_KERNEL is owned by the
 * kernel and must not be read by user space. */
if (h.h3->hdr.bh1.block_status == TP_STATUS_KERNEL) {
...
}
1.2 读取/处理数据
如果当前block的status为TP_STATUS_USER,则开始读取数据。
...
/* Skip the block header: advance to the first packet in the block */
handlep->current_packet = h.raw + h.h3->hdr.bh1.offset_to_first_pkt;
/* Number of packets stored in the current block */
handlep->packets_left = h.h3->hdr.bh1.num_pkts;
while (packets_to_read-- && !handle->break_loop) {
struct tpacket3_hdr* tp3_hdr = (struct tpacket3_hdr*) handlep->current_packet;
ret = pcap_handle_packet_mmap(
handle,
callback,
user,
handlep->current_packet,
tp3_hdr->tp_len,
tp3_hdr->tp_mac,
tp3_hdr->tp_snaplen,
tp3_hdr->tp_sec,
handle->opt.tstamp_precision == PCAP_TSTAMP_PRECISION_NANO ? tp3_hdr->tp_nsec : tp3_hdr->tp_nsec / 1000,
VLAN_VALID(tp3_hdr, &tp3_hdr->hv1),
tp3_hdr->hv1.tp_vlan_tci,
VLAN_TPID(tp3_hdr, &tp3_hdr->hv1));
...
/* Move to the next packet in this block */
handlep->current_packet += tp3_hdr->tp_next_offset;
handlep->packets_left--;
}
...
回调用户处理函数
/* handle a single memory mapped packet */
/* Excerpt: builds a pcap header from the tpacket3 metadata (lengths,
 * timestamp, VLAN info) and invokes the user callback; returns 1 when
 * the packet was handled. */
static int pcap_handle_packet_mmap(
pcap_t *handle,
pcap_handler callback,
u_char *user,
unsigned char *frame,
unsigned int tp_len,
unsigned int tp_mac,
unsigned int tp_snaplen,
unsigned int tp_sec,
unsigned int tp_usec,
int tp_vlan_tci_valid,
__u16 tp_vlan_tci,
__u16 tp_vlan_tpid)
{
...
/* pass the packet to the user */
callback(user, &pcaphdr, bp);
return 1;
}
1.3 "释放"当前block
当前block的数据包处理完成后,需要将当前block归还给内核,让内核可以继续写数据;只需将状态值设置为TP_STATUS_KERNEL即可。
/* All packets in this block consumed: hand the block back to the
 * kernel by resetting its status, then advance the ring offset. */
if (handlep->packets_left <= 0) {
h.h3->hdr.bh1.block_status = TP_STATUS_KERNEL;
...
/* next block (the ring wraps around at handle->cc) */
if (++handle->offset >= handle->cc)
handle->offset = 0;
handlep->current_packet = NULL;
}
1.4 等待数据
因为block是一个循环队列,如果当前block的状态是TP_STATUS_KERNEL,则说明后面都没有数据,只能等待。
libpcap通过poll
进行数据的等待,其中fd则是最开始创建的socket。
/* Current block still owned by the kernel: nothing to read anywhere
 * in the ring, so block until the kernel retires a block. */
if (h.h3->hdr.bh1.block_status == TP_STATUS_KERNEL) {
ret = pcap_wait_for_frames_mmap(handle);
if (ret) {
return ret;
}
}
/* Excerpt: wait in poll(2) on the packet socket until the kernel
 * signals that ring data is ready. */
static int pcap_wait_for_frames_mmap(pcap_t *handle)
{
struct pcap_linux *handlep = handle->priv;
...
struct pollfd pollinfo;
int ret;
pollinfo.fd = handle->fd;
pollinfo.events = POLLIN;
do {
ret = poll(&pollinfo, 1, handlep->poll_timeout);
...
} while (ret < 0); /* retry while poll() reports an error */
1.4.1 阻塞模式
默认是阻塞模式,并且TPACKET_V3下超时为-1(永不超时),当没有流量时,将不会被唤醒。
/* Excerpt: choose the poll() timeout. With a working TPACKET_V3 the
 * timeout is -1 (block forever); the kernel wakes us when a block is
 * retired. */
static void
set_poll_timeout(struct pcap_linux *handlep) {
...
if (handlep->tp_version == TPACKET_V3 && !broken_tpacket_v3)
handlep->poll_timeout = -1; /* block forever, let TPACKET_V3 wake us up */
else
...
}
如何提前唤醒?
在 libpcap-1.10.4之前无法提前唤醒,只能等待数据的到来,在新版本中增加了一个fd,专门用来提前唤醒。
https://github.com/the-tcpdump-group/libpcap/pull/741/commits/5c8b13d3e87542527ed9a3a79fb0f9b2edb74df1
- 在创建handle时,同时创建了poll_breakloop_fd
/* Excerpt: create the Linux capture handle; also creates the eventfd
 * used solely to wake poll() early from pcap_breakloop(). */
pcap_t *
pcap_create_interface(const char *device, char *ebuf)
{
pcap_t *handle;
handle = PCAP_CREATE_COMMON(ebuf, struct pcap_linux);
if (handle == NULL)
return NULL;
...
struct pcap_linux *handlep = handle->priv;
/* non-blocking eventfd; written by pcap_breakloop_linux() */
handlep->poll_breakloop_fd = eventfd(0, EFD_NONBLOCK);
return handle;
}
- 激活handle时设置对应的break_loop callback
/* Excerpt: activation installs the Linux-specific breakloop hook. */
static int
pcap_activate_linux(pcap_t *handle)
...
handle->breakloop_op = pcap_breakloop_linux;
...
}
- poll时将poll_breakloop_fd也监听
/* Excerpt (newer libpcap): poll on two fds so pcap_breakloop() can
 * interrupt an otherwise unbounded wait. */
static int pcap_wait_for_frames_mmap(pcap_t *handle)
{
struct pcap_linux *handlep = handle->priv;
...
/* [0] = the packet socket, [1] = the breakloop eventfd */
struct pollfd pollinfo[2];
int numpollinfo;
pollinfo[0].fd = handle->fd;
pollinfo[0].events = POLLIN;
...
pollinfo[1].fd = handlep->poll_breakloop_fd;
pollinfo[1].events = POLLIN;
numpollinfo = 2;
...
- 调用
pcap_breakloop
通知唤醒
/* Public API: request that the capture loop stop; delegates to the
 * per-platform breakloop_op (pcap_breakloop_linux on Linux). */
void
pcap_breakloop(pcap_t *p)
{
p->breakloop_op(p);
}
/* Linux breakloop: set the common break flag, then write to the
 * eventfd so a thread blocked in poll() is woken immediately. */
static void pcap_breakloop_linux(pcap_t *handle)
{
pcap_breakloop_common(handle);
struct pcap_linux *handlep = handle->priv;
uint64_t value = 1;
/* XXX - what if this fails? */
if (handlep->poll_breakloop_fd != -1)
(void)write(handlep->poll_breakloop_fd, &value, sizeof(value));
}
- poll被poll_breakloop_fd唤醒
/* The breakloop eventfd became readable: drain its counter, then
 * honor the break_loop flag by returning PCAP_ERROR_BREAK. */
if (pollinfo[1].revents & POLLIN) {
ssize_t nread;
uint64_t value;
nread = read(handlep->poll_breakloop_fd, &value,
sizeof(value));
...
if (handle->break_loop) {
handle->break_loop = 0;
return PCAP_ERROR_BREAK;
}
}
1.4.2 非阻塞模式
没有数据时,立刻返回,通过如下API进行设置
int
pcap_setnonblock(pcap_t *p, int nonblock, char *errbuf)
二、内核空间
内核在接收到数据包时,将调用相应的处理函数进行处理
__netif_receive_skb_core()
deliver_skb()
/* Excerpt: hand the skb to a registered protocol handler; for a
 * packet socket with an rx ring, pt_prev->func is tpacket_rcv
 * (see text below). */
static inline int deliver_skb(struct sk_buff *skb,
struct packet_type *pt_prev,
struct net_device *orig_dev)
{
...
return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}
而pt_prev->func实际为在设置rx ring时设置的函数tpacket_rcv
2.1 判断是否有可用空间
如果没有空间了,则当前数据包被丢弃。
/* If we are flooded, just give up */
/* No room in the ring: count the drop and discard this packet. */
if (__packet_rcv_has_room(po, skb) == ROOM_NONE) {
atomic_inc(&po->tp_drops);
goto drop_n_restore;
}
2.2 获取一个可用的block
h.raw = packet_current_rx_frame(po, skb,
TP_STATUS_KERNEL, (macoff+snaplen));
2.3 拷贝数据到block中
skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
2.4 更新block属性
switch (po->tp_version) {
...
case TPACKET_V3:
/* Publish per-packet metadata into the ring header. */
h.h3->tp_status |= status;
h.h3->tp_len = skb->len; /* full on-wire length */
h.h3->tp_snaplen = snaplen; /* bytes actually copied */
h.h3->tp_mac = macoff; /* offset of the MAC header */
h.h3->tp_net = netoff; /* offset of the network header */
h.h3->tp_sec = ts.tv_sec;
h.h3->tp_nsec = ts.tv_nsec;
memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
hdrlen = sizeof(*h.h3);
break;
default:
BUG();
}
2.5 何时更新block状态
1. 当block写满时
在每次获取block时,将判断当前block是否有足够的空间写入当前的数据包
/* Excerpt: locate space in the rx ring for an incoming packet;
 * for TPACKET_V3 this delegates to __packet_lookup_frame_in_block. */
static void *packet_current_rx_frame(struct packet_sock *po,
struct sk_buff *skb,
int status, unsigned int len)
{
char *curr = NULL;
switch (po->tp_version) {
...
case TPACKET_V3:
return __packet_lookup_frame_in_block(po, skb, len);
...
}
}
/* Excerpt: find room for `len` bytes in the currently active block. */
static void *__packet_lookup_frame_in_block(struct packet_sock *po,
struct sk_buff *skb,
unsigned int len
)
{
...
/* Enough room in the current block: fill it in and return it. */
if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
prb_fill_curr_block(curr, pkc, pbd, len);
return (void *)curr;
}
空间不足时,将当前block 关闭(即将状态设置为TP_STATUS_USER
),并通知socket fd有数据, 用户空间的poll则会被唤醒。
/* Ok, close the current block */
prb_retire_current_block(pkc, po, 0);
/* Excerpt: retire the active block; delegates to prb_close_block(). */
static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
struct packet_sock *po, unsigned int status)
{
...
prb_close_block(pkc, pbd, po, status);
...
}
/* Excerpt: hand the block over to user space and advance the ring. */
static void prb_close_block(struct tpacket_kbdq_core *pkc1,
struct tpacket_block_desc *pbd1,
struct packet_sock *po, unsigned int stat)
{
/* Mark the block readable by user space, plus any extra flags. */
__u32 status = TP_STATUS_USER | stat;
...
/* Flush the block */
prb_flush_block(pkc1, pbd1, status);
/* Notify the socket that data is ready; a blocked user-space poll() wakes up */
sk->sk_data_ready(sk);
/* Make the next ring block the active one */
pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
}
/* Excerpt: publish the final status word of a retired block. */
static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
struct tpacket_block_desc *pbd1, __u32 status)
{
/* Now update the block status. */
BLOCK_STATUS(pbd1) = status;
}
2. 当block超时时
当流量很小时,block一直都不会被写满,因此数据一直停留在block中,上层应用无法获取数据;因此增加了一个timer.
- 在建立ring buf时初始化timer并设置超时处理函数
/* Excerpt: rx-ring setup also arms the block-retire timer. */
static void init_prb_bdqc(struct packet_sock *po,
struct packet_ring_buffer *rb,
struct pgv *pg_vec,
union tpacket_req_u *req_u)
{
...
prb_setup_retire_blk_timer(po);
...
}
/* Install the retire timer on the rx ring; on expiry it runs
 * prb_retire_rx_blk_timer_expired(). */
static void prb_setup_retire_blk_timer(struct packet_sock *po)
{
struct tpacket_kbdq_core *pkc;
pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
0);
pkc->retire_blk_timer.expires = jiffies;
}
- timer超时,调用回调函数,关闭当前block
/* Excerpt: timer callback. If the active block number has not changed
 * since the last timer run (block presumably stalled with unread
 * data), retire it with the timeout status bit set so user space can
 * read the partially-filled block. */
static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
{
...
if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
...
prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
...
}
...
}