DPDK开发之KNI模块代码实现
- 背景
- KNI实现原理 -- ifreq
- 代码实现
- 总结
背景
在DPDK开发的时候,如果有些协议不想处理,只处理关注的协议,可以把其他协议写回内核,让内核处理。此时的DPDK就起到分发的作用,类似一个过滤器。
KNI实现原理 – ifreq
主要利用内核的/dev/net/tun。做VPN时也会用到这个设备文件。
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <net/if.h>
#include <linux/if_tun.h>
#include <errno.h>
#include <sys/ioctl.h>
int tun_alloc(char *dev)
{
struct ifreq ifr;
memset(&ifr,0,sizeof(ifr));
int fd=open("/dev/net/tun",O_RDWR);
if(fd<0)
return -1;
// IFF_TAP针对的是以太网协议,需要传入MAC;TUN主要针对IP层协议
ifr.ifr_flags=IFF_TAP|IFF_NO_PI;
memcpy(ifr.ifr_name,dev,strlen(dev));
int err;
printf("fd = %d, dev = %s, len = %ld\n",fd,dev,strlen(dev));
// 设置进去
if((err=ioctl(fd,TUNSETIFF,(char *)&ifr))<0)
{
printf("ioctl fail(%d): %s\n",err,strerror(errno));
close(fd);
return err;
}
return fd;
}
/*
 * Demo driver: asks tun_alloc() for an interface called "MyDev", prints the
 * value it returned, then waits for a character on stdin before exiting.
 */
int main()
{
    const int result = tun_alloc("MyDev");

    printf("return code %d\n", result);

    /* Wait for input so the process does not exit immediately. */
    (void)getchar();
    return 0;
}
特别注意,ifr.ifr_name不能有空格。
编译:
gcc -o ifr ifr.c
执行后,使用如下命令查询:
ifconfig -a
可以看到多了MyDev。
MyDev Link encap:以太网 硬件地址 c2:44:70:f2:79:f9
BROADCAST MULTICAST MTU:1500 跃点数:1
接收数据包:0 错误:0 丢弃:0 过载:0 帧数:0
发送数据包:0 错误:0 丢弃:0 过载:0 载波:0
碰撞:0 发送队列长度:1000
接收字节:0 (0.0 B) 发送字节:0 (0.0 B)
ens33 Link encap:以太网 硬件地址 00:0c:29:79:9b:f7
inet 地址:192.168.0.106 广播:192.168.0.255 掩码:255.255.255.0
inet6 地址: fe80::b608:7cba:aa19:e2d/64 Scope:Link
UP BROADCAST RUNNING MULTICAST MTU:1500 跃点数:1
接收数据包:7543 错误:0 丢弃:0 过载:0 帧数:0
发送数据包:4518 错误:0 丢弃:0 过载:0 载波:0
碰撞:0 发送队列长度:1000
接收字节:3014986 (3.0 MB) 发送字节:657222 (657.2 KB)
lo Link encap:本地环回
inet 地址:127.0.0.1 掩码:255.0.0.0
inet6 地址: ::1/128 Scope:Host
UP LOOPBACK RUNNING MTU:65536 跃点数:1
接收数据包:304 错误:0 丢弃:0 过载:0 帧数:0
发送数据包:304 错误:0 丢弃:0 过载:0 载波:0
碰撞:0 发送队列长度:1000
接收字节:25426 (25.4 KB) 发送字节:25426 (25.4 KB)
这就是kni的实现原理,由两部分组成:
(1)对外提供了一个字符设备,通过ioctl()操作。
(2)底层是一个网口。
代码实现
- 定义全局的KNI变量:struct rte_kni *。
- KNI初始化:rte_kni_init(…)。
- 完善struct rte_kni_conf,用于写入内核中。
- 完善struct rte_kni_ops。
- 实现一个config_network_if类型的函数,用于网络的up、down操作。
- 分配KNI,保存到全局变量中:rte_kni_alloc(…)。
- 把包发送到内核中:rte_kni_tx_burst(…)。
- 特别注意,要打开混杂模式:rte_eth_promiscuous_enable(…)。
- 这里演示了把包发送到内核,并没有从内核中抓取返回的包发送出去。
(dpdk_udp.c)
#include <rte_eal.h>
#include <rte_ethdev.h>
#include <rte_mbuf.h>
#include <rte_kni.h>
#include <stdio.h>
#include <arpa/inet.h>
// Feature switches tested with #if throughout this file.
#define ENABLE_SEND 1
#define ENABLE_KNI 1

// Number of mbufs requested when the packet pool is created.
#define NUM_MBUFS (4096-1)
// Maximum number of packets fetched by one rx burst.
#define BURST_SIZE 32

// Port id handed to every rte_eth_* call in this file.
int gDpdkPortId = 0;

// Baseline port configuration copied by ng_init_port().
static const struct rte_eth_conf port_conf_default = {
.rxmode = {.max_rx_pkt_len = RTE_ETHER_MAX_LEN }
};

#if ENABLE_KNI
// KNI handle; assigned from rte_kni_alloc() in main().
struct rte_kni *global_kni = NULL;
#endif

#if ENABLE_SEND
// Addressing of the next outgoing UDP frame. main() fills these from the
// headers of the received packet (source and destination swapped) and
// ng_encode_udp_pkt() writes them into the reply unchanged.
static uint32_t gSrcIp;
static uint32_t gDstIp;
static uint16_t gSrcPort;
// A UDP port is 16 bits wide; keep the same width as gSrcPort so the value
// is identical on little- and big-endian hosts.
static uint16_t gDstPort;
static uint8_t gSrcMac[RTE_ETHER_ADDR_LEN];
static uint8_t gDstMac[RTE_ETHER_ADDR_LEN];
#endif
//
/*
 * Configure and start the Ethernet port gDpdkPortId, then enable
 * promiscuous mode on it.
 *
 * One RX queue is always set up; one TX queue is set up when ENABLE_SEND is
 * non-zero.
 *
 * mbuf_pool: pool the RX queue takes its receive buffers from.
 *
 * Any failed setup step ends the process through rte_exit().
 */
static void ng_init_port(struct rte_mempool *mbuf_pool) {
    // At least one usable port has to be available.
    uint16_t nb_sys_ports = rte_eth_dev_count_avail();
    if (nb_sys_ports == 0) {
        rte_exit(EXIT_FAILURE, "No Supported eth found\n");
    }

    // Device information; default_txconf seeds the TX queue setup below.
    struct rte_eth_dev_info dev_info;
    rte_eth_dev_info_get(gDpdkPortId, &dev_info);

    const int num_rx_queues = 1;
    const int num_tx_queues = 1;
    struct rte_eth_conf port_conf = port_conf_default;
    // Queue setup on an unconfigured port cannot succeed, so fail early.
    if (rte_eth_dev_configure(gDpdkPortId, num_rx_queues, num_tx_queues, &port_conf) < 0) {
        rte_exit(EXIT_FAILURE, "Could not configure port\n");
    }

    // RX queue 0 with 1024 descriptors.
    if (rte_eth_rx_queue_setup(gDpdkPortId, 0, 1024,
            rte_eth_dev_socket_id(gDpdkPortId), NULL, mbuf_pool) < 0) {
        rte_exit(EXIT_FAILURE, "Could not setup RX queue\n");
    }

#if ENABLE_SEND
    // TX queue 0 with 1024 descriptors. The queue offloads must mirror the
    // port's TX offloads (txmode), not its RX offloads.
    struct rte_eth_txconf txq_conf = dev_info.default_txconf;
    txq_conf.offloads = port_conf.txmode.offloads;
    if (rte_eth_tx_queue_setup(gDpdkPortId, 0, 1024,
            rte_eth_dev_socket_id(gDpdkPortId), &txq_conf) < 0) {
        rte_exit(EXIT_FAILURE, "Could not setup TX queue\n");
    }
#endif

    if (rte_eth_dev_start(gDpdkPortId) < 0) {
        rte_exit(EXIT_FAILURE, "Could not start\n");
    }

    rte_eth_promiscuous_enable(gDpdkPortId);
}
#if ENABLE_SEND
static int ng_encode_udp_pkt(uint8_t *msg, unsigned char *data, uint16_t
total_len) {
// encode
// 1 ethhdr
struct rte_ether_hdr *eth = (struct rte_ether_hdr *)msg;
rte_memcpy(eth->s_addr.addr_bytes, gSrcMac, RTE_ETHER_ADDR_LEN);
rte_memcpy(eth->d_addr.addr_bytes, gDstMac, RTE_ETHER_ADDR_LEN);
eth->ether_type = htons(RTE_ETHER_TYPE_IPV4);
// 2 iphdr
struct rte_ipv4_hdr *ip = (struct rte_ipv4_hdr *)(msg + sizeof(struct
rte_ether_hdr));
ip->version_ihl = 0x45;
ip->type_of_service = 0;
ip->total_length = htons(total_len - sizeof(struct rte_ether_hdr));
ip->packet_id = 0;
ip->fragment_offset = 0;
ip->time_to_live = 64; // ttl = 64
ip->next_proto_id = IPPROTO_UDP;
ip->src_addr = gSrcIp;
ip->dst_addr = gDstIp;
ip->hdr_checksum = 0;
ip->hdr_checksum = rte_ipv4_cksum(ip);
// 3 udphdr
struct rte_udp_hdr *udp = (struct rte_udp_hdr *)(msg + sizeof(struct
rte_ether_hdr) + sizeof(struct rte_ipv4_hdr));
udp->src_port = gSrcPort;
udp->dst_port = gDstPort;
uint16_t udplen = total_len - sizeof(struct rte_ether_hdr) - sizeof(struct
rte_ipv4_hdr);
udp->dgram_len = htons(udplen);
rte_memcpy((uint8_t*)(udp+1), data, udplen);
udp->dgram_cksum = 0;
udp->dgram_cksum = rte_ipv4_udptcp_cksum(ip, udp);
struct in_addr addr;
addr.s_addr = gSrcIp;
printf(" --> src: %s:%d, ", inet_ntoa(addr), ntohs(gSrcPort));
addr.s_addr = gDstIp;
printf("dst: %s:%d\n", inet_ntoa(addr), ntohs(gDstPort));
return 0;
}
/*
 * Allocate an mbuf from mbuf_pool and fill it with a UDP frame that carries
 * `length` bytes taken from `data`; the frame itself is written by
 * ng_encode_udp_pkt().
 *
 * Returns the prepared mbuf. When no mbuf can be allocated the process is
 * terminated through rte_exit().
 */
static struct rte_mbuf * ng_send(struct rte_mempool *mbuf_pool, uint8_t *data
, uint16_t length) {
    // 42 = Ethernet (14) + IPv4 (20) + UDP (8) header bytes ahead of the data.
    const unsigned frame_len = length + 42;

    struct rte_mbuf *pkt = rte_pktmbuf_alloc(mbuf_pool);
    if (pkt == NULL) {
        rte_exit(EXIT_FAILURE, "rte_pktmbuf_alloc\n");
    }

    // Single-segment packet: segment length and packet length are equal.
    pkt->data_len = frame_len;
    pkt->pkt_len = frame_len;

    ng_encode_udp_pkt(rte_pktmbuf_mtod(pkt, uint8_t*), data, frame_len);

    return pkt;
}
#endif
#if ENABLE_KNI
/*
 * Interface up/down handler; main() installs it as the config_network_if
 * callback of the KNI ops.
 *
 * port_id: Ethernet port to act on.
 * if_up:   non-zero restarts the port (stop, then start); zero stops it.
 *
 * Returns -EINVAL for an invalid port id, the negative result of
 * rte_eth_dev_start() when the restart fails, and 0 otherwise.
 */
static int gconfig_network_if(uint16_t port_id, uint8_t if_up) {
    if (!rte_eth_dev_is_valid_port(port_id)) {
        return -EINVAL;
    }

    int ret = 0;

    // Both directions begin by stopping the port; "up" is a restart.
    rte_eth_dev_stop(port_id);
    if (if_up) {
        ret = rte_eth_dev_start(port_id);
        if (ret < 0) {
            printf("Failed to start port : %d\n", port_id);
        }
    }

    // Report the real outcome instead of claiming success unconditionally.
    return ret;
}
#endif
int main(int argc, char *argv[]) {
if (rte_eal_init(argc, argv) < 0) {
rte_exit(EXIT_FAILURE, "Error with EAL init\n");
}
struct rte_mempool *mbuf_pool = rte_pktmbuf_pool_create("mbuf pool",
NUM_MBUFS,
0, 0, RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());
if (mbuf_pool == NULL) {
rte_exit(EXIT_FAILURE, "Could not create mbuf pool\n");
}
#if ENABLE_KNI
rte_kni_init(gDpdkPortId);
#endif
ng_init_port(mbuf_pool);
#if ENABLE_KNI
struct rte_kni_conf conf;
memset(&conf, 0, sizeof(conf));
snprintf(conf.name, RTE_KNI_NAMESIZE, "vEth%d", gDpdkPortId);
conf.group_id = gDpdkPortId;
conf.mbuf_size = RTE_MBUF_DEFAULT_BUF_SIZE;
//conf.
rte_eth_macaddr_get(gDpdkPortId, (struct rte_ether_addr*)conf.mac_addr);
rte_eth_dev_get_mtu(gDpdkPortId, &conf.mtu);
struct rte_kni_ops ops;
memset(&ops, 0, sizeof(ops));
ops.port_id = gDpdkPortId;
ops.config_network_if = gconfig_network_if;
global_kni = rte_kni_alloc(mbuf_pool, &conf, &ops);
#endif
while (1) {
struct rte_mbuf *mbufs[BURST_SIZE];
unsigned num_recvd = rte_eth_rx_burst(gDpdkPortId, 0, mbufs, BURST_SIZE);
if (num_recvd > BURST_SIZE) {
rte_exit(EXIT_FAILURE, "Error receiving from eth\n");
}
unsigned i = 0;
for (i = 0;i < num_recvd;i ++) {
struct rte_ether_hdr *ehdr = rte_pktmbuf_mtod(mbufs[i], struct
rte_ether_hdr*);
if (ehdr->ether_type != rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4)) {
continue;
}
struct rte_ipv4_hdr *iphdr = rte_pktmbuf_mtod_offset(mbufs[i], struct
rte_ipv4_hdr *,
sizeof(struct rte_ether_hdr));
if (iphdr->next_proto_id == IPPROTO_UDP) {
struct rte_udp_hdr *udphdr = (struct rte_udp_hdr *)(iphdr + 1);
#if ENABLE_SEND // echo
// mac exchange
rte_memcpy(gDstMac, ehdr->s_addr.addr_bytes, RTE_ETHER_ADDR_LEN);
rte_memcpy(gSrcMac, ehdr->d_addr.addr_bytes, RTE_ETHER_ADDR_LEN);
// ip exchange
rte_memcpy(&gSrcIp, &iphdr->dst_addr, sizeof(uint32_t));
rte_memcpy(&gDstIp, &iphdr->src_addr, sizeof(uint32_t));
// port exchange
rte_memcpy(&gSrcPort, &udphdr->dst_port, sizeof(uint16_t));
rte_memcpy(&gDstPort, &udphdr->src_port, sizeof(uint16_t));
#endif
uint16_t length = ntohs(udphdr->dgram_len);
*((char*)udphdr + length) = '\0';
struct in_addr addr;
addr.s_addr = iphdr->src_addr;
printf("src: %s:%d, ", inet_ntoa(addr), udphdr->src_port);
addr.s_addr = iphdr->dst_addr;
printf("dst: %s:%d, %s\n", inet_ntoa(addr), udphdr->src_port,
(char *)(udphdr+1));
#if ENABLE_SEND
struct rte_mbuf *txbuf = ng_send(mbuf_pool, (unsigned char*)(udphdr+1),
length);
rte_eth_tx_burst(gDpdkPortId, 0, &txbuf, 1);
#endif
rte_pktmbuf_free(mbufs[i]);
} else {
rte_kni_tx_burst(global_kni, &mbufs[i], 1);
}
}
}
}
Makefile:
# binary name
APP = dpdk_udp
# all source are stored in SRCS-y
SRCS-y := dpdk_udp.c
# Build using pkg-config variables if possible
# (this branch is taken when pkg-config can find libdpdk)
ifeq ($(shell pkg-config --exists libdpdk && echo 0),0)
# Default goal: the shared-linked binary.
all: shared
.PHONY: shared static
# Each flavour is built under build/ and build/$(APP) is then pointed at it
# with a symbolic link.
shared: build/$(APP)-shared
ln -sf $(APP)-shared build/$(APP)
static: build/$(APP)-static
ln -sf $(APP)-static build/$(APP)
# Compiler and linker flags are taken from the libdpdk pkg-config file.
PKGCONF=pkg-config --define-prefix
PC_FILE := $(shell $(PKGCONF) --path libdpdk)
CFLAGS += -O3 $(shell $(PKGCONF) --cflags libdpdk)
LDFLAGS_SHARED = $(shell $(PKGCONF) --libs libdpdk)
LDFLAGS_STATIC = -Wl,-Bstatic $(shell $(PKGCONF) --static --libs libdpdk)
# Rebuild when the sources, this Makefile or the .pc file change; the build
# directory is an order-only prerequisite.
build/$(APP)-shared: $(SRCS-y) Makefile $(PC_FILE) | build
$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED)
build/$(APP)-static: $(SRCS-y) Makefile $(PC_FILE) | build
$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_STATIC)
# Create the output directory on demand.
build:
@mkdir -p $@
.PHONY: clean
# Remove the binaries and the build directory if it ends up empty.
clean:
rm -f build/$(APP) build/$(APP)-static build/$(APP)-shared
test -d build && rmdir -p build || true
# Fallback when pkg-config does not know libdpdk: use the make files found
# under RTE_SDK.
else
ifeq ($(RTE_SDK),)
$(error "Please define RTE_SDK environment variable")
endif
# Default target, detect a build directory, by looking for a path with a .config
RTE_TARGET ?= $(notdir $(abspath $(dir $(firstword $(wildcard $(RTE_SDK)/*/.config)))))
include $(RTE_SDK)/mk/rte.vars.mk
# NOTE(review): the listing stops here; the endif that closes the outer ifeq
# and any remaining fallback rules are not visible in this excerpt.
总结
调试时,需要把/sys/devices/virtual/net/vEth0/carrier置为1。允许内核收发数据。
echo 1 > /sys/devices/virtual/net/vEth0/carrier