一、用户空间
在通过socket(AF_PACKET,…)创建fd后,进行接收队列的建立
//pcap-linux.c
static int
pcap_activate_linux(pcap_t *handle) {
...
/* Set up the PACKET_MMAP ring buffer for this capture handle.
 * status is filled with a PCAP_ERROR/warning code on failure
 * (see *status = PCAP_ERROR inside create_ring below). */
ret = setup_mmapped(handle, &status);
...
}
1.1 设置默认ring buffer size
static int
setup_mmapped(pcap_t *handle, int *status)
{
...
//1.
/* Step 1: if the caller did not request an explicit buffer size,
 * default the ring buffer to 2 MiB. */
if (handle->opt.buffer_size == 0) {
/* by default request 2M for the ring buffer */
handle->opt.buffer_size = 2*1024*1024;
}
...
}
1.2 确定tpacket版本
不同的版本格式略有不同。
static int
setup_mmapped(pcap_t *handle, int *status)
{
...
//1.
...
//2.
/* Step 2: determine which TPACKET version (V1/V2/V3) the socket
 * will use; the ring's memory layout differs per version.
 * (Implementation of prepare_tpacket_socket not shown here.) */
ret = prepare_tpacket_socket(handle);
...
}
1.3 创建ring
static int
setup_mmapped(pcap_t *handle, int *status)
{
...
//1.
...
//2.
...
//3.
/* Step 3: ask the kernel to allocate the RX ring and mmap it
 * into this process (see create_ring below). */
ret = create_ring(handle, status);
...
}
计算block_size,frame_size等
以TPACKET3为例
/*
 * Compute the ring geometry (frame size, frame count, block size) for the
 * negotiated TPACKET version, then request the ring from the kernel and
 * map it into user space. On error sets *status to PCAP_ERROR and
 * returns -1.
 */
static int
create_ring(pcap_t *handle, int *status)
{
struct pcap_linux *handlep = handle->priv;
unsigned i, j, frames_per_block;
...
struct tpacket_req3 req;
...
socklen_t len;
unsigned int sk_type, tp_reserve, maclen, tp_hdrlen, netoff, macoff;
unsigned int frame_size;
...
switch (handlep->tp_version) {
case TPACKET_V2:
...
break;
case TPACKET_V3:
/* V3: each frame can hold a maximum-snaplen packet; the frame
 * count is the requested buffer size divided by the frame size,
 * rounded up so the whole buffer is covered. */
req.tp_frame_size = MAXIMUM_SNAPLEN;
req.tp_frame_nr = (handle->opt.buffer_size + req.tp_frame_size - 1)/req.tp_frame_size;
break;
default:
...
*status = PCAP_ERROR;
return -1;
}
/* Block size: start at one page and double until a single block can
 * hold at least one frame (kernel requires a power-of-two multiple
 * of the page size). */
req.tp_block_size = getpagesize();
while (req.tp_block_size < req.tp_frame_size)
req.tp_block_size <<= 1;
...
请求内核创建接收队列ring buffer
...
/* How many whole frames fit in one block. */
frames_per_block = req.tp_block_size/req.tp_frame_size;
...
/* ask the kernel to create the ring */
retry:
/* Round the frame count down to a whole number of blocks, then
 * recompute tp_frame_nr so it exactly equals
 * frames_per_block * tp_block_nr, as the kernel requires. */
req.tp_block_nr = req.tp_frame_nr / frames_per_block;
/* req.tp_frame_nr is requested to match frames_per_block*req.tp_block_nr */
req.tp_frame_nr = req.tp_block_nr * frames_per_block;
...
/* Hand the geometry to the kernel; this lands in packet_setsockopt()
 * -> packet_set_ring() on the kernel side. */
if (setsockopt(handle->fd, SOL_PACKET, PACKET_RX_RING,
(void *) &req, sizeof(req))) {
...
}
将内核创建的ring buffer映射到用户空间
/* memory map the rx ring */
/* Map the kernel-allocated ring (tp_block_nr blocks of tp_block_size
 * bytes each) into this process; the kernel side is packet_mmap(). */
handlep->mmapbuflen = req.tp_block_nr * req.tp_block_size;
handlep->mmapbuf = mmap(0, handlep->mmapbuflen,
PROT_READ|PROT_WRITE, MAP_SHARED, handle->fd, 0);
...
初始化用户态的映射空间头
...
/* fill the header ring with proper frame ptr*/
/* Walk every block and record the address of each frame within it,
 * so later reads can index frames directly; frames do not span
 * block boundaries. handle->offset is reset to 0 afterwards so
 * capture starts at the first frame. */
handle->offset = 0;
for (i=0; i<req.tp_block_nr; ++i) {
u_char *base = &handlep->mmapbuf[i*req.tp_block_size];
for (j=0; j<frames_per_block; ++j, ++handle->offset) {
RING_GET_CURRENT_FRAME(handle) = base;
base += req.tp_frame_size;
}
}
handle->bufsize = req.tp_frame_size;
handle->offset = 0;
...
二、内核空间
2.1 创建 rx ring
// net/packet/af_packet.c
/*
 * Kernel-side handler for setsockopt(SOL_PACKET, ...) on an AF_PACKET
 * socket; the PACKET_RX_RING/PACKET_TX_RING cases copy the ring request
 * from user space and hand it to packet_set_ring().
 */
static int
packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
{
struct sock *sk = sock->sk;
struct packet_sock *po = pkt_sk(sk);
int ret;
if (level != SOL_PACKET)
return -ENOPROTOOPT;
switch (optname) {
...
case PACKET_RX_RING:
case PACKET_TX_RING:
{
/* req_u is a union of the per-version request structs; only the
 * first 'len' bytes (chosen by tp_version below) are copied in. */
union tpacket_req_u req_u;
int len;
lock_sock(sk);
switch (po->tp_version) {
case TPACKET_V1:
case TPACKET_V2:
len = sizeof(req_u.req);
break;
case TPACKET_V3:
default:
len = sizeof(req_u.req3);
break;
}
if (optlen < len) {
ret = -EINVAL;
} else {
if (copy_from_user(&req_u.req, optval, len))
ret = -EFAULT;
else
/* closing=0: we are creating, not tearing down, the ring. */
ret = packet_set_ring(sk, &req_u, 0,
optname == PACKET_TX_RING);
}
release_sock(sk);
return ret;
}
default:
return -ENOPROTOOPT;
}
...
}
分配空间
/*
 * Allocate (or, when closing, release) the ring buffer for one direction
 * of an AF_PACKET socket and install it on the socket. Excerpt: only the
 * allocation path is shown here.
 */
static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
int closing, int tx_ring)
{
struct pgv *pg_vec = NULL;
struct packet_sock *po = pkt_sk(sk);
...
int was_running, order = 0;
...
/* Added to avoid minimal code churn */
struct tpacket_req *req = &req_u->req;
...
/* Allocate tp_block_nr blocks, each of 2^order pages, collected in
 * the pg_vec page vector. */
order = get_order(req->tp_block_size);
pg_vec = alloc_pg_vec(req, order);
初始化
...
/* TPACKET_V3 RX rings need the extra block-descriptor queue state
 * (timers, block headers) set up before use; V1/V2 need nothing here. */
switch (po->tp_version) {
case TPACKET_V3:
/* Block transmit is not supported yet */
if (!tx_ring) {
init_prb_bdqc(po, rb, pg_vec, req_u);
} else {
...
}
break;
default:
break;
}
将新空间挂入队列上
/* Publish the new ring on the socket. Only proceed if nothing has the
 * old ring mmap()ed (or we are closing); pg_vec_lock serializes against
 * packet_mmap(), and the rb_queue spinlock against the receive path. */
mutex_lock(&po->pg_vec_lock);
if (closing || atomic_read(&po->mapped) == 0) {
err = 0;
spin_lock_bh(&rb_queue->lock);
/* swap() leaves the old vector in the local pg_vec for freeing
 * by the (elided) cleanup code. */
swap(rb->pg_vec, pg_vec);
rb->frame_max = (req->tp_frame_nr - 1);
rb->head = 0;
rb->frame_size = req->tp_frame_size;
spin_unlock_bh(&rb_queue->lock);
swap(rb->pg_vec_order, order);
swap(rb->pg_vec_len, req->tp_block_nr);
rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
/* With an RX ring installed, incoming packets are delivered through
 * tpacket_rcv() (writes into the ring) instead of packet_rcv(). */
po->prot_hook.func = (po->rx_ring.pg_vec) ?
tpacket_rcv : packet_rcv;
/* Drop any packets queued under the old delivery function. */
skb_queue_purge(rb_queue);
if (atomic_read(&po->mapped))
pr_err("packet_mmap: vma is busy: %d\n",
atomic_read(&po->mapped));
}
mutex_unlock(&po->pg_vec_lock);
2.2 将队列空间映射到用户空间
/*
 * mmap() handler for AF_PACKET sockets: inserts every page of the RX and
 * TX rings (whichever exist) into the caller's VMA, back to back, so user
 * space sees the whole ring as one contiguous mapping.
 * NOTE(review): the matching mutex_lock(&po->pg_vec_lock) and the size
 * checks are in the elided "..." section above the mapping loop.
 */
static int packet_mmap(struct file *file, struct socket *sock,
struct vm_area_struct *vma)
{
struct sock *sk = sock->sk;
struct packet_sock *po = pkt_sk(sk);
unsigned long size, expected_size;
struct packet_ring_buffer *rb;
unsigned long start;
int err = -EINVAL;
int i;
...
start = vma->vm_start;
/* Iterate rx_ring then tx_ring (they are adjacent members of *po). */
for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
if (rb->pg_vec == NULL)
continue;
/* For each block, insert its pages one at a time. */
for (i = 0; i < rb->pg_vec_len; i++) {
struct page *page;
void *kaddr = rb->pg_vec[i].buffer;
int pg_num;
for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
page = pgv_to_page(kaddr);
err = vm_insert_page(vma, start, page);
if (unlikely(err))
goto out;
start += PAGE_SIZE;
kaddr += PAGE_SIZE;
}
}
}
/* Record the active mapping; packet_set_ring() refuses to swap rings
 * while mapped != 0. */
atomic_inc(&po->mapped);
vma->vm_ops = &packet_mmap_ops;
err = 0;
out:
mutex_unlock(&po->pg_vec_lock);
return err;
}