1、什么是链接跟踪
连接跟踪,顾名思义,就是跟踪(并记录)连接的状态。一般conntrack用来指代“Connection Tracking”,即连接跟踪,是建立在 Netfilter框架之上的重要功能之一。
2、为什么需要链路跟踪
- 因为它是状态防火墙和NAT的实现基础。
3、内核中的链接跟踪
- 用于实现连接跟踪入口的hook函数以较高的优先级分别被注册到了netfitler的NF_IP_PRE_ROUTING和NF_IP_LOCAL_OUT两个hook点上;
- 用于实现连接跟踪出口的hook函数以非常低的优先级分别被注册到了netfilter的NF_IP_LOCAL_IN和NF_IP_POST_ROUTING两个hook点上。
3.1、连接跟踪的报文走向及hook注册
- 在连接跟踪模块中,一个数据包无外乎三种流程可以走:
1、发送给本机的数据包
流程:PRE_ROUTING----LOCAL_IN—本地进程
2、需要本机转发的数据包
流程:PRE_ROUTING—FORWARD—POST_ROUTING—外出
3、从本机发出的数据包
流程:LOCAL_OUT----POST_ROUTING—外出
static struct nf_hook_ops ipv4_defrag_ops[] = {
{
/*对数据进行分片检测*/
.hook = ipv4_conntrack_defrag,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP_PRI_CONNTRACK_DEFRAG,
},
{
.hook = ipv4_conntrack_defrag,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_CONNTRACK_DEFRAG,
},
};
static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
{
/*刚进入netfilter框架在第一个PREROUTEING链上建立连接跟踪*/
.hook = ipv4_conntrack_in,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP_PRI_CONNTRACK,
},
{
/*本机产生的数据包在OUT链上建立连接跟踪*/
.hook = ipv4_conntrack_local,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_CONNTRACK,
},
{
.hook = ipv4_helper,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_CONNTRACK_HELPER,
},
{
/*数据包最后出去在POSTROUTING链上连接跟踪确认*/
.hook = ipv4_confirm,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_CONNTRACK_CONFIRM,
},
{
.hook = ipv4_helper,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_CONNTRACK_HELPER,
},
{
/*在LOCAL_IN链进入本机的数据连接跟踪确认*/
.hook = ipv4_confirm,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_CONNTRACK_CONFIRM,
},
};
3.2、连接跟踪的状态
enum ip_conntrack_info {
/* Part of an established connection (either direction). */
IP_CT_ESTABLISHED, //Packet是一个已建连接的一部分,在其初始方向
/* Like NEW, but related to an existing connection, or ICMP error
(in either direction). */
IP_CT_RELATED, //Packet属于一个已建连接的相关连接,在其初始方向。
/* Started a new connection to track (only
IP_CT_DIR_ORIGINAL); may be a retransmission. */
IP_CT_NEW, //
/* >= this indicates reply direction */
IP_CT_IS_REPLY,
IP_CT_ESTABLISHED_REPLY = IP_CT_ESTABLISHED + IP_CT_IS_REPLY, //Packet是一个已建连接的一部分,在其响应方向
IP_CT_RELATED_REPLY = IP_CT_RELATED + IP_CT_IS_REPLY, //Packet属于一个已建连接的相关连接,在其响应方向。
IP_CT_NEW_REPLY = IP_CT_NEW + IP_CT_IS_REPLY,
/* Number of distinct IP_CT types (no NEW in reply dirn). */
IP_CT_NUMBER = IP_CT_IS_REPLY * 2 - 1
};
3.3、链路跟踪的入口与出口
连接跟踪分入口和出口两个点。
- 入口时创建连接跟踪记录
ipv4_conntrack_defrag、ipv4_conntrack_in、ipv4_conntrack_local
整个入口的流程简述如下:对于每个到来的skb,连接跟踪都将其转换成一个tuple结构,然后用该tuple去查连接跟踪表。如果该类型的数据包没有被跟踪过,将为其在连接跟踪的hash表里建立一个连接记录项,对于已经跟踪过了的数据包则不用此操作。紧接着,调用该报文所属协议的连接跟踪模块的所提供的packet()回调函数,最后根据状态改变连接跟踪记录的状态。 - 出口时将该记录加入到连接跟踪表中
ipv4_helper、ipv4_confirm
整个出口的流程简述如下:对于每个即将离开Netfilter框架的数据包,如果用于处理该协议类型报文的连接跟踪模块提供了helper函数,那么该数据包首先会被helper函数处理,然后才去判断,如果该报文已经被跟踪过了,那么其所属连接的状态,决定该包是该被丢弃、或是返回协议栈继续传输,又或者将其加入到连接跟踪表中
3.4、连接跟踪的关键函数
3.4.1、ipv4_conntrack_defrag
这个函数主要是检测是否被分片,如果被分片就重组。
static unsigned int ipv4_conntrack_defrag(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
struct sock *sk = skb->sk;
if (sk && sk_fullsock(sk) && (sk->sk_family == PF_INET) &&
inet_sk(sk)->nodefrag)
return NF_ACCEPT;
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#if !IS_ENABLED(CONFIG_NF_NAT)
/*该数据包的连接跟踪选项已经建立就直接返回*/
if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct))
return NF_ACCEPT;
#endif
#endif
/* Gather fragments. */
if (ip_is_fragment(ip_hdr(skb))) {
enum ip_defrag_users user =
nf_ct_defrag_user(state->hook, skb);
//数据包分片
if (nf_ct_ipv4_gather_frags(state->net, skb, user))
return NF_STOLEN;
}
return NF_ACCEPT;
}
3.4.2、 ipv4_conntrack_in
这个函数主要是主要是初始化一条链接、更新链接状态
static unsigned int ipv4_conntrack_in(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
return nf_conntrack_in(state->net, PF_INET, state->hook, skb);
}
3.4.2.1、nf_conntrack_in
unsigned int nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
struct sk_buff *skb)
{
struct nf_conn *ct, *tmpl = NULL;
enum ip_conntrack_info ctinfo;
struct nf_conntrack_l3proto *l3proto;
struct nf_conntrack_l4proto *l4proto;
unsigned int *timeouts;
unsigned int dataoff;
u_int8_t protonum;
int set_reply = 0;
int ret;
/*nfct不为NULL说明已经建立连接跟踪选项*/
if (skb->nfct) {
/* Previously seen (loopback or untracked)? Ignore. */
tmpl = (struct nf_conn *)skb->nfct;
if (!nf_ct_is_template(tmpl)) {
NF_CT_STAT_INC_ATOMIC(net, ignore);
return NF_ACCEPT;
}
skb->nfct = NULL;
}
/* rcu_read_lock()ed by nf_hook_slow */
/*根据三层协议号在nf_ct_l3protos数组中寻找三层struct nf_conntrack_l3proto实例*/
l3proto = __nf_ct_l3proto_find(pf);
/*获取四层协议号*/
ret = l3proto->get_l4proto(skb, skb_network_offset(skb),
&dataoff, &protonum);
if (ret <= 0) {
pr_debug("not prepared to track yet or error occurred\n");
NF_CT_STAT_INC_ATOMIC(net, error);
NF_CT_STAT_INC_ATOMIC(net, invalid);
ret = -ret;
goto out;
}
/*根据三层协议号、四层协议号获取四层struct nf_conntrack_l4proto实例*/
l4proto = __nf_ct_l4proto_find(pf, protonum);
/* It may be an special packet, error, unclean...
* inverse of the return code tells to the netfilter
* core what to do with the packet. */
if (l4proto->error != NULL) {
ret = l4proto->error(net, tmpl, skb, dataoff, &ctinfo,
pf, hooknum);
if (ret <= 0) {
NF_CT_STAT_INC_ATOMIC(net, error);
NF_CT_STAT_INC_ATOMIC(net, invalid);
ret = -ret;
goto out;
}
/* ICMP[v6] protocol trackers may assign one conntrack. */
if (skb->nfct)
goto out;
}
/*从tuple hash表中获取struct nf_conn结构体和reply方向数据标志*/
ct = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum,
l3proto, l4proto, &set_reply, &ctinfo);
if (!ct) {
/* Not valid part of a connection */
NF_CT_STAT_INC_ATOMIC(net, invalid);
ret = NF_ACCEPT;
goto out;
}
if (IS_ERR(ct)) {
/* Too stressed to deal. */
NF_CT_STAT_INC_ATOMIC(net, drop);
ret = NF_DROP;
goto out;
}
NF_CT_ASSERT(skb->nfct);
/* Decide what timeout policy we want to apply to this flow. */
timeouts = nf_ct_timeout_lookup(net, ct, l4proto);
/*填充tuple结构中四层的元素*/
ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum, timeouts);
if (ret <= 0) {
/* Invalid: inverse of the return code tells
* the netfilter core what to do */
pr_debug("nf_conntrack_in: Can't track with proto module\n");
nf_conntrack_put(skb->nfct);
skb->nfct = NULL;
NF_CT_STAT_INC_ATOMIC(net, invalid);
if (ret == -NF_DROP)
NF_CT_STAT_INC_ATOMIC(net, drop);
ret = -ret;
goto out;
}
/*当在reply方向收到数据包后设置链接状态为IPS_SEEN_REPLY_BIT
状态改变调用nf_conntrack_event_cache ,由nfnetlink模块处理状态改变的事件*/
if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
nf_conntrack_event_cache(IPCT_REPLY, ct);
out:
if (tmpl) {
/* Special case: we have to repeat this hook, assign the
* template again to this packet. We assume that this packet
* has no conntrack assigned. This is used by nf_ct_tcp. */
if (ret == NF_REPEAT)
skb->nfct = (struct nf_conntrack *)tmpl;
else
nf_ct_put(tmpl);
}
return ret;
}
3.4.2.2、nf_ct_is_template
判断不是IPS_TEMPLATE_BIT
static inline int nf_ct_is_template(const struct nf_conn *ct)
{
return test_bit(IPS_TEMPLATE_BIT, &ct->status);
}
3.4.2.3、resolve_normal_ct
这个函数主要是判断连接跟踪是否存在,不存在就去创建。然后设置连接的状态
/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
static inline struct nf_conn * resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
struct sk_buff *skb,
unsigned int dataoff,
u_int16_t l3num,
u_int8_t protonum,
struct nf_conntrack_l3proto *l3proto,
struct nf_conntrack_l4proto *l4proto,
int *set_reply,
enum ip_conntrack_info *ctinfo)
{
const struct nf_conntrack_zone *zone;
struct nf_conntrack_tuple tuple;
struct nf_conntrack_tuple_hash *h;
struct nf_conntrack_zone tmp;
struct nf_conn *ct;
u32 hash;
//获取tuple
if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
dataoff, l3num, protonum, net, &tuple, l3proto,
l4proto)) {
pr_debug("resolve_normal_ct: Can't get tuple\n");
return NULL;
}
/* look for tuple match */
//hash表中查找tuple
zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
hash = hash_conntrack_raw(&tuple);
h = __nf_conntrack_find_get(net, zone, &tuple, hash);
if (!h) {
//没有找到就新建一个tuple
h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto,
skb, dataoff, hash);
if (!h)
return NULL;
if (IS_ERR(h))
return (void *)h;
}
//根据tuple得到连接跟踪结构体nf_conn
//根据nf_conntrack_tuple_hash{}结构体中tuplehash[IP_CT_DIR_ORIGINAL]成员的地址,反过来计算其所在的结构体nf_conn{}对象的首地址
ct = nf_ct_tuplehash_to_ctrack(h);
/* It exists; we have (non-exclusive) reference. */
/*数据包是reply方向表名连接双向已经建立
设置数据包的状态为IP_CT_ESTABLISHED + IP_CT_IS_REPLY*/
if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
*ctinfo = IP_CT_ESTABLISHED_REPLY;
/* Please set reply bit if this packet OK */
*set_reply = 1;
} else {
/* Once we've had two way comms, always ESTABLISHED. */
/*数据包是orig方向,以及收到reply方向的数据则设置数据包状态为IP_CT_ESTABLISHED*/
if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
pr_debug("nf_conntrack_in:normal packet for %pK\n", ct);
//两个方向都已经建立了
*ctinfo = IP_CT_ESTABLISHED;
/*还没有收到reply方向数据包,是一个期望连接设置数据包状态为IP_CT_RELATED*/
} else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
pr_debug("nf_conntrack_in: related packet for %pK\n",
ct);
*ctinfo = IP_CT_RELATED;
} else {
pr_debug("nf_conntrack_in: new packet for %pK\n", ct);
/*没有收到relply方向的数据包,而且不是期望连接设置数据包状态为IP_CT_NEW*/
*ctinfo = IP_CT_NEW;
}
*set_reply = 0;
}
skb->nfct = &ct->ct_general;
skb->nfctinfo = *ctinfo;
return ct;
}
3.4.2.4、nf_ct_get_tuple
主要根据协议号调用pkt_to_tuple生成一个tuple,tcp/udp协议就是生成五元组(源ip、目的ip、源端口、目的端口、协议号),icmp协议就是(id、code、type)
bool nf_ct_get_tuple(const struct sk_buff *skb,
unsigned int nhoff,
unsigned int dataoff,
u_int16_t l3num,
u_int8_t protonum,
struct net *net,
struct nf_conntrack_tuple *tuple,
const struct nf_conntrack_l3proto *l3proto,
const struct nf_conntrack_l4proto *l4proto)
{
memset(tuple, 0, sizeof(*tuple));
tuple->src.l3num = l3num;
/*三层协议从skb中获取源ip、目的ip保存到tuple*/
if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0)
return false;
tuple->dst.protonum = protonum;
/*方向orig*/
tuple->dst.dir = IP_CT_DIR_ORIGINAL;
/*四层协议tcp/udp后去源端口、目的端口保存到tuple 如果是icmp就获取type、code、id*/
return l4proto->pkt_to_tuple(skb, dataoff, net, tuple);
}
3.4.2.5、__nf_conntrack_find_get
static struct nf_conntrack_tuple_hash *
__nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
const struct nf_conntrack_tuple *tuple, u32 hash)
{
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
rcu_read_lock();
begin:
/*查找tuple*/
h = ____nf_conntrack_find(net, zone, tuple, hash);
if (h) {
ct = nf_ct_tuplehash_to_ctrack(h);
if (unlikely(nf_ct_is_dying(ct) ||
!atomic_inc_not_zero(&ct->ct_general.use)))
h = NULL;
else {
if (unlikely(!nf_ct_key_equal(h, tuple, zone))) {
nf_ct_put(ct);
goto begin;
}
}
}
rcu_read_unlock();
return h;
}
3.4.2.6、____nf_conntrack_find
static struct nf_conntrack_tuple_hash *
____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone,
const struct nf_conntrack_tuple *tuple, u32 hash)
{
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
unsigned int bucket = hash_bucket(hash, net);
/* Disable BHs the entire time since we normally need to disable them
* at least once for the stats anyway.
*/
local_bh_disable();
begin:
/*遍历链表查找tuple*/
hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[bucket], hnnode) {
if (nf_ct_key_equal(h, tuple, zone)) {
NF_CT_STAT_INC(net, found);
local_bh_enable();
return h;
}
NF_CT_STAT_INC(net, searched);
}
/*
* if the nulls value we got at the end of this lookup is
* not the expected one, we must restart lookup.
* We probably met an item that was moved to another chain.
*/
if (get_nulls_value(n) != bucket) {
NF_CT_STAT_INC(net, search_restart);
goto begin;
}
local_bh_enable();
return NULL;
}
3.4.2.7、init_conntrack
static struct nf_conntrack_tuple_hash *
init_conntrack(struct net *net, struct nf_conn *tmpl,
const struct nf_conntrack_tuple *tuple,
struct nf_conntrack_l3proto *l3proto,
struct nf_conntrack_l4proto *l4proto,
struct sk_buff *skb,
unsigned int dataoff, u32 hash)
{
struct nf_conn *ct;
struct nf_conn_help *help;
struct nf_conntrack_tuple repl_tuple;
struct nf_conntrack_ecache *ecache;
struct nf_conntrack_expect *exp = NULL;
const struct nf_conntrack_zone *zone;
struct nf_conn_timeout *timeout_ext;
struct nf_conntrack_zone tmp;
unsigned int *timeouts;
/*tuplehash的reply方向的tuple赋值,起始就是orig方向的反过来*/
if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) {
pr_debug("Can't invert tuple.\n");
return NULL;
}
zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
/*分配一个nf_conn结构体*/
ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
hash);
if (IS_ERR(ct))
return (struct nf_conntrack_tuple_hash *)ct;
if (tmpl && nfct_synproxy(tmpl)) {
nfct_seqadj_ext_add(ct);
nfct_synproxy_ext_add(ct);
}
timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;
if (timeout_ext) {
timeouts = nf_ct_timeout_data(timeout_ext);
if (unlikely(!timeouts))
timeouts = l4proto->get_timeouts(net);
} else {
timeouts = l4proto->get_timeouts(net);
}
/*对nf_conn进行四层协议的初始化*/
if (!l4proto->new(ct, skb, dataoff, timeouts)) {
nf_conntrack_free(ct);
pr_debug("init conntrack: can't track with proto module\n");
return NULL;
}
if (timeout_ext)
nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout),
GFP_ATOMIC);
nf_ct_acct_ext_add(ct, GFP_ATOMIC);
nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
nf_ct_labels_ext_add(ct);
nf_ct_dscpremark_ext_add(ct, GFP_ATOMIC);
ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;
nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
ecache ? ecache->expmask : 0,
GFP_ATOMIC);
local_bh_disable();
if (net->ct.expect_count) {
spin_lock(&nf_conntrack_expect_lock);
/*查找是否是已建立连接的期望连接*/
exp = nf_ct_find_expectation(net, zone, tuple);
if (exp) {
pr_debug("conntrack: expectation arrives ct=%pK exp=%pK\n",
ct, exp);
/* Welcome, Mr. Bond. We've been expecting you... */
/*如果是期望连接设置IPS_EXPECTED_BIT标志位并且给ct->master赋值期望*/
__set_bit(IPS_EXPECTED_BIT, &ct->status);
/* exp->master safe, refcnt bumped in nf_ct_find_expectation */
ct->master = exp->master;
if (exp->helper) {
help = nf_ct_helper_ext_add(ct, exp->helper,
GFP_ATOMIC);
if (help)
rcu_assign_pointer(help->helper, exp->helper);
}
#ifdef CONFIG_NF_CONNTRACK_MARK
ct->mark = exp->master->mark;
#endif
#ifdef CONFIG_NF_CONNTRACK_SECMARK
ct->secmark = exp->master->secmark;
#endif
NF_CT_STAT_INC(net, expect_new);
}
spin_unlock(&nf_conntrack_expect_lock);
}
if (!exp) {
__nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);
NF_CT_STAT_INC(net, new);
}
/* Now it is inserted into the unconfirmed list, bump refcount */
nf_conntrack_get(&ct->ct_general);
nf_ct_add_to_unconfirmed_list(ct);
local_bh_enable();
if (exp) {
if (exp->expectfn)
exp->expectfn(ct, exp);
nf_ct_expect_put(exp);
}
return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
}
3.4.2.8、nf_ct_invert_tuple
bool nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
const struct nf_conntrack_tuple *orig,
const struct nf_conntrack_l3proto *l3proto,
const struct nf_conntrack_l4proto *l4proto)
{
memset(inverse, 0, sizeof(*inverse));
inverse->src.l3num = orig->src.l3num;
/*三层reply方向的初始化*/
if (l3proto->invert_tuple(inverse, orig) == 0)
return false;
inverse->dst.dir = !orig->dst.dir;
inverse->dst.protonum = orig->dst.protonum;
/*四层reply方向的tuple初始化*/
return l4proto->invert_tuple(inverse, orig);
}
3.4.2.9、__nf_conntrack_alloc
static struct nf_conn *
__nf_conntrack_alloc(struct net *net,
const struct nf_conntrack_zone *zone,
const struct nf_conntrack_tuple *orig,
const struct nf_conntrack_tuple *repl,
gfp_t gfp, u32 hash)
{
struct nf_conn *ct;
if (unlikely(!nf_conntrack_hash_rnd)) {
init_nf_conntrack_hash_rnd();
/* recompute the hash as nf_conntrack_hash_rnd is initialized */
hash = hash_conntrack_raw(orig);
}
/* We don't want any race condition at early drop stage */
atomic_inc(&net->ct.count);
/*连接跟踪数量已经超过最大值nf_conntrack_max
根据tuple算出hash值,对于连接跟踪项的status的
IPS_ASSURED_BIT位没有被置位的连接跟踪项,则强制删除。*/
if (nf_conntrack_max &&
unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
if (!early_drop(net, hash)) {
atomic_dec(&net->ct.count);
net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
return ERR_PTR(-ENOMEM);
}
}
/*
* Do not use kmem_cache_zalloc(), as this cache uses
* SLAB_DESTROY_BY_RCU.
*/
/*为struct nf_conn分配空间*/
ct = kmem_cache_alloc(net->ct.nf_conntrack_cachep, gfp);
if (ct == NULL)
goto out;
spin_lock_init(&ct->lock);
ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
/* save hash for reusing when confirming */
*(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
ct->status = 0;
/* Don't set timer yet: wait for confirmation */
setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct);
write_pnet(&ct->ct_net, net);
memset(&ct->__nfct_init_offset[0], 0,
offsetof(struct nf_conn, proto) -
offsetof(struct nf_conn, __nfct_init_offset[0]));
if (zone && nf_ct_zone_add(ct, GFP_ATOMIC, zone) < 0)
goto out_free;
/* Because we use RCU lookups, we set ct_general.use to zero before
* this is inserted in any list.
*/
atomic_set(&ct->ct_general.use, 0);
return ct;
out_free:
kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
out:
atomic_dec(&net->ct.count);
return ERR_PTR(-ENOMEM);
}
3.4.2.10、nf_ct_tuplehash_to_ctrack
static inline struct nf_conn *
nf_ct_tuplehash_to_ctrack(const struct nf_conntrack_tuple_hash *hash)
{
return container_of(hash, struct nf_conn,
tuplehash[hash->tuple.dst.dir]);
}
3.4.3、ipv4_conntrack_local
static unsigned int ipv4_conntrack_local(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
/* root is playing with raw sockets. */
if (skb->len < sizeof(struct iphdr) ||
ip_hdrlen(skb) < sizeof(struct iphdr))
return NF_ACCEPT;
return nf_conntrack_in(state->net, PF_INET, state->hook, skb);
}
3.4.4、ipv4_helper
static unsigned int ipv4_helper(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
struct nf_conn *ct;
enum ip_conntrack_info ctinfo;
const struct nf_conn_help *help;
const struct nf_conntrack_helper *helper;
/* This is where we call the helper: as the packet goes out. */
//获取连接跟踪数据,没有建立返回null
ct = nf_ct_get(skb, &ctinfo);
if (!ct || ctinfo == IP_CT_RELATED_REPLY)
return NF_ACCEPT;
help = nfct_help(ct);
if (!help)
return NF_ACCEPT;
/* rcu_read_lock()ed by nf_hook_slow */
helper = rcu_dereference(help->helper);
if (!helper)
return NF_ACCEPT;
return helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb),
ct, ctinfo);
}
3.4.5、 nf_conntrack_confirm
确认前面通过 nf_conntrack_in() 创建的新连接(是否被丢弃),将元组从未确认tuplehash列表中删除
/* Confirm a connection: returns NF_DROP if packet must be dropped. */
static inline int nf_conntrack_confirm(struct sk_buff *skb)
{
//skb所属连接的连接跟踪模块
struct nf_conn *ct = (struct nf_conn *)skb->nfct;
int ret = NF_ACCEPT;
if (ct && !nf_ct_is_untracked(ct)) {
//检查该连接是否已经被确认过
if (!nf_ct_is_confirmed(ct))
ret = __nf_conntrack_confirm(skb);
if (likely(ret == NF_ACCEPT))
//向外部模块发送缓存的事件
nf_ct_deliver_cached_events(ct);
}
return ret;
}
3.4.5.1 __nf_conntrack_confirm
int __nf_conntrack_confirm(struct sk_buff *skb)
{
const struct nf_conntrack_zone *zone;
unsigned int hash, reply_hash;
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
struct nf_conn_help *help;
struct nf_conn_tstamp *tstamp;
struct hlist_nulls_node *n;
enum ip_conntrack_info ctinfo;
struct net *net;
unsigned int sequence;
ct = nf_ct_get(skb, &ctinfo);
net = nf_ct_net(ct);
/* ipt_REJECT uses nf_conntrack_attach to attach related
ICMP/TCP RST packets in other direction. Actual packet
which created connection will be IP_CT_NEW or for an
expected connection, IP_CT_RELATED. */
if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
return NF_ACCEPT;
zone = nf_ct_zone(ct);
local_bh_disable();
do {
sequence = read_seqcount_begin(&net->ct.generation);
/* reuse the hash saved before */
hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
hash = hash_bucket(hash, net);
reply_hash = hash_conntrack(net,
&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
/* We're not in hash table, and we refuse to set up related
* connections for unconfirmed conns. But packet copies and
* REJECT will give spurious warnings here.
*/
/* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
/* No external references means no one else could have
* confirmed us.
*/
NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
pr_debug("Confirming conntrack %pK\n", ct);
/* We have to check the DYING flag after unlink to prevent
* a race against nf_ct_get_next_corpse() possibly called from
* user context, else we insert an already 'dead' hash, blocking
* further use of that particular connection -JM.
*/
nf_ct_del_from_dying_or_unconfirmed_list(ct);
if (unlikely(nf_ct_is_dying(ct)))
goto out;
/* See if there's one in the list already, including reverse:
NAT could have grabbed it without realizing, since we're
not in the hash. If there is, we lost race. */
hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
&h->tuple) &&
nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone,
NF_CT_DIRECTION(h)))
goto out;
hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode)
if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
&h->tuple) &&
nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone,
NF_CT_DIRECTION(h)))
goto out;
/* Timer relative to confirmation time, not original
setting time, otherwise we'd get timer wrap in
weird delay cases. */
ct->timeout.expires += jiffies;
//激活超时定时器
add_timer(&ct->timeout);
atomic_inc(&ct->ct_general.use);
ct->status |= IPS_CONFIRMED;
/* set conntrack timestamp, if enabled. */
tstamp = nf_conn_tstamp_find(ct);
if (tstamp) {
if (skb->tstamp.tv64 == 0)
__net_timestamp(skb);
tstamp->start = ktime_to_ns(skb->tstamp);
}
/* Since the lookup is lockless, hash insertion must be done after
* starting the timer and setting the CONFIRMED bit. The RCU barriers
* guarantee that no other CPU can find the conntrack before the above
* stores are visible.
*/
__nf_conntrack_hash_insert(ct, hash, reply_hash);
nf_conntrack_double_unlock(hash, reply_hash);
NF_CT_STAT_INC(net, insert);
local_bh_enable();
help = nfct_help(ct);
if (help && help->helper)
nf_conntrack_event_cache(IPCT_HELPER, ct);
nf_conntrack_event_cache(master_ct(ct) ?
IPCT_RELATED : IPCT_NEW, ct);
return NF_ACCEPT;
out:
nf_ct_add_to_dying_list(ct);
nf_conntrack_double_unlock(hash, reply_hash);
NF_CT_STAT_INC(net, insert_failed);
local_bh_enable();
return NF_DROP;
}
3.5、连接超时机制
3.5.1、定时器的初始化
struct nf_conn *nf_conntrack_alloc(struct net *net,
const struct nf_conntrack_zone *zone,
const struct nf_conntrack_tuple *orig,
const struct nf_conntrack_tuple *repl,
gfp_t gfp)
{
return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0);
}
static struct nf_conn *
__nf_conntrack_alloc(struct net *net,
const struct nf_conntrack_zone *zone,
const struct nf_conntrack_tuple *orig,
const struct nf_conntrack_tuple *repl,
gfp_t gfp, u32 hash)
{
struct nf_conn *ct;
if (unlikely(!nf_conntrack_hash_rnd)) {
init_nf_conntrack_hash_rnd();
/* recompute the hash as nf_conntrack_hash_rnd is initialized */
hash = hash_conntrack_raw(orig);
}
/* We don't want any race condition at early drop stage */
atomic_inc(&net->ct.count);
if (nf_conntrack_max &&
unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
if (!early_drop(net, hash)) {
atomic_dec(&net->ct.count);
net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
return ERR_PTR(-ENOMEM);
}
}
/*
* Do not use kmem_cache_zalloc(), as this cache uses
* SLAB_DESTROY_BY_RCU.
*/
ct = kmem_cache_alloc(net->ct.nf_conntrack_cachep, gfp);
if (ct == NULL)
goto out;
spin_lock_init(&ct->lock);
ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
/* save hash for reusing when confirming */
*(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
ct->status = 0;
/* Don't set timer yet: wait for confirmation */
//定时器的初始化
setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct);
write_pnet(&ct->ct_net, net);
memset(&ct->__nfct_init_offset[0], 0,
offsetof(struct nf_conn, proto) -
offsetof(struct nf_conn, __nfct_init_offset[0]));
if (zone && nf_ct_zone_add(ct, GFP_ATOMIC, zone) < 0)
goto out_free;
/* Because we use RCU lookups, we set ct_general.use to zero before
* this is inserted in any list.
*/
atomic_set(&ct->ct_general.use, 0);
return ct;
out_free:
kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
out:
atomic_dec(&net->ct.count);
return ERR_PTR(-ENOMEM);
}
3.5.2、定时器超时回调
static void death_by_timeout(unsigned long ul_conntrack)
{
nf_ct_delete((struct nf_conn *)ul_conntrack, 0, 0);
}
bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
{
struct nf_conn_tstamp *tstamp;
tstamp = nf_conn_tstamp_find(ct);
if (tstamp && tstamp->stop == 0)
tstamp->stop = ktime_get_real_ns();
if (nf_ct_is_dying(ct))
goto delete;
if (nf_conntrack_event_report(IPCT_DESTROY, ct,
portid, report) < 0) {
/* destroy event was not delivered */
nf_ct_delete_from_lists(ct);
nf_conntrack_ecache_delayed_work(nf_ct_net(ct));
return false;
}
nf_conntrack_ecache_work(nf_ct_net(ct));
set_bit(IPS_DYING_BIT, &ct->status);
delete:
nf_ct_delete_from_lists(ct);
nf_ct_put(ct);
return true;
}
3.5.3、连接删除
static void clean_from_lists(struct nf_conn *ct)
{
pr_debug("clean_from_lists(%pK)\n", ct);
hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);
/* Destroy all pending expectations */
//销毁所有的期望连接
nf_ct_remove_expectations(ct);
}
3.5.4、定时器的更新
当收到数据包后,也应该更新该定时器,防止其超时,这是通过调用
static int udp_packet(struct nf_conn *ct,
const struct sk_buff *skb,
unsigned int dataoff,
enum ip_conntrack_info ctinfo,
u_int8_t pf,
unsigned int hooknum,
unsigned int *timeouts)
{
/* If we've seen traffic both ways, this is some kind of UDP
stream. Extend timeout. */
if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
nf_ct_refresh_acct(ct, ctinfo, skb,
timeouts[UDP_CT_REPLIED]);
/* Also, more likely to be important, and not a probe */
if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
nf_conntrack_event_cache(IPCT_ASSURED, ct);
} else {
nf_ct_refresh_acct(ct, ctinfo, skb,
timeouts[UDP_CT_UNREPLIED]);
}
return NF_ACCEPT;
}
static inline void nf_ct_refresh_acct(struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
const struct sk_buff *skb,
unsigned long extra_jiffies)
{
__nf_ct_refresh_acct(ct, ctinfo, skb, extra_jiffies, 1);
}
void __nf_ct_refresh_acct(struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
const struct sk_buff *skb,
unsigned long extra_jiffies,
int do_acct)
{
NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
NF_CT_ASSERT(skb);
/* Only update if this is not a fixed timeout */
//设定了该标记的连接的超时值将无法被更新
if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
goto acct;
/* If not in hash table, timer will not be active yet */
if (!nf_ct_is_confirmed(ct)) {
//连接跟踪信息快还没被确认时,该定时器就还没有被激活,此时重新设定超时时间,认为是重新更新超时时间戳
ct->timeout.expires = extra_jiffies;
} else {
//传入超时值是当前时间的相对值
unsigned long newtime = jiffies + extra_jiffies;
/* Only update the timeout if the new timeout is at least
HZ jiffies from the old timeout. Need del_timer for race
avoidance (may already be dying). */
//只有当新的超时值至少超过单管超时值1S时才重新更新定时器
if (newtime - ct->timeout.expires >= HZ)
mod_timer_pending(&ct->timeout, newtime);
}
acct:
//做数据统计
if (do_acct) {
struct nf_conn_acct *acct;
acct = nf_conn_acct_find(ct);
if (acct) {
struct nf_conn_counter *counter = acct->counter;
atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets);
atomic64_add(skb->len, &counter[CTINFO2DIR(ctinfo)].bytes);
}
}
}