目录
- 数据接收流程图
- 硬件层
- 网络层
- ip_rcv
- ip_rcv_core
- ip_rcv_finish 和 ip_rcv_finish_core
- ip_local_deliver
- ip_local_deliver_finish 和 ip_protocol_deliver_rcu
- 传输层
- tcp_v4_rcv
- tcp_v4_do_rcv
- tcp_rcv_state_process
- tcp_rcv_established
- tcp_recvmsg
- 数据结构
- socket
- sock
- sock_common
- sk_buff
linux内核源码下载:https://cdn.kernel.org/pub/linux/kernel/
我下载的是:linux-5.11.1.tar.gz
数据接收流程图
1,一般网卡接收数据是以触发中断来接收的,在网卡driver中,接收到数据时,往kernel的api:netif_rx()丢。
2,接着数据被送到IP层ip_local_deliver_finish(),经过剥离ip头部,把数据往TCP层发。
3,tcp层tcp_v4_rcv()收到数据后,再调用tcp_rcv_established()(ack也是其中的tcp_ack()发送的)处理,如果当前用户进程没有正在读取数据,则将其插入到接收队列中,tcp_queue_rcv()则判断接收队列是否为空,如果不为空,则将skb合并到接收队列的尾部,最后由tcp_recvmsg()从接收列队中一个一个的将skb读取到用户设置的buffer中去。
4,上层通过recvmsg等函数去接收数据时,就是从buffer中去读取的。
调用栈:
1、从下往上:网络设备接口-> Netif_rx -> ip_rcv -> ip_rcv_finished -> ip_local_deliver -> ip_local_deliver_finished -> tcp_v4_rcv -> tcp_v4_do_rcv -> tcp_rcv_established -> tcp_data_queue ->sk_data_ready
2、从上往下:应用层recvfrom -> SYSCALL_DEFINE2 -> __sys_recvfrom -> sock_recvmsg -> sock_recvmsg_nosec -> inet_recvmsg -> tcp_recvmsg -> 读取队列
硬件层
netif_rx()
所在目录:/linux-5.11.1/net/core/dev.c。
/**
* netif_rx - post buffer to the network code 把接收数据投递到网络层代码
* @skb: buffer to post 要投递的数据buf
*
* This function receives a packet from a device driver and queues it for
* the upper (protocol) levels to process. It always succeeds. The buffer
* may be dropped during processing for congestion control or by the
* protocol layers.
* 此函数从设备驱动程序接收数据包,并将其排队到上层(协议)级别处理。
* 它总是成功的。缓冲区可以在拥塞控制处理期间或通过协议层被丢弃。
* return values:
* NET_RX_SUCCESS (no congestion)
* NET_RX_DROP (packet was dropped)
*
*/
int netif_rx(struct sk_buff *skb)
{
int ret;
trace_netif_rx_entry(skb);
ret = netif_rx_internal(skb);//把网卡接收的skb存储起来
trace_netif_rx_exit(ret);
return ret;
}
该函数在网卡驱动程序与linux内核之间建立了一道桥梁,将网卡接收上来的数据包(sk_buff形式)插入内核维护的接收缓冲区队列当中。
他的主要任务是把数据帧添加到CPU的输入队列input_pkt_queue中。随后标记软中断来处理后续上传数据帧给TCP/IP协议栈。
网络层
所在目录:/linux-5.11.1/net/ipv4/ip_input.c。
ip_local_deliver(解析IP头部,组包)
ip_local_deliver_finish(根据IP报头的protocol字段,找到对应的L4协议,TCP/UDP)。
ip_rcv
/* from ip_input.c
* IP receive entry point
* IP接收入口函数
*/
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
struct net_device *orig_dev)
{
struct net *net = dev_net(dev);
skb = ip_rcv_core(skb, net);//IP层主要处理函数
if (skb == NULL)
return NET_RX_DROP;
//钩子函数,通过iptables的PRE_ROUTING链,如果继续则进入ip_rcv_finish
return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
net, NULL, skb, dev, NULL,
ip_rcv_finish);
}
ip_rcv_core
/* from ip_input.c
* IP层接收主处理函数
* Main IP Receive routine.
*/
static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
{
const struct iphdr *iph;
u32 len;
/* When the interface is in promisc. mode, drop all the crap
* that it receives, do not try to analyse it.
*/
//丢弃PACKET_OTHERHOST类型的包
if (skb->pkt_type == PACKET_OTHERHOST)
goto drop;
__IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len);
skb = skb_share_check(skb, GFP_ATOMIC);
if (!skb) {
__IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS);
goto out;
}
//尝试以最小长度获取ip头,获取不到则返回error
if (!pskb_may_pull(skb, sizeof(struct iphdr)))
goto inhdr_error;
//获取ip头
iph = ip_hdr(skb);
/*
* RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum.
*
* Is the datagram acceptable?
*
* 1. Length at least the size of an ip header
* 2. Version of 4
* 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums]
* 4. Doesn't have a bogus length
*/
//校验ip头的长度和版本
if (iph->ihl < 5 || iph->version != 4)
goto inhdr_error;
BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1);
BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0);
BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE);
__IP_ADD_STATS(net,
IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK),
max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs));
//尝试以ip头携带的ip头部长度获取ip头,获取失败返回error
if (!pskb_may_pull(skb, iph->ihl*4))
goto inhdr_error;
iph = ip_hdr(skb);
if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
goto csum_error;
//skb的长度,比ip头携带的ip包总长度小,丢弃报文
//ip包总长度,比ip头长度小,返回error
len = ntohs(iph->tot_len);
if (skb->len < len) {
__IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS);
goto drop;
} else if (len < (iph->ihl*4))
goto inhdr_error;
/* Our transport medium may have padded the buffer out. Now we know it
* is IP we can trim to the true length of the frame.
* Note this now means skb->len holds ntohs(iph->tot_len).
*/
if (pskb_trim_rcsum(skb, len)) {
__IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS);
goto drop;
}
iph = ip_hdr(skb);
//获取传输层数据指针
skb->transport_header = skb->network_header + iph->ihl*4;
/* Remove any debris in the socket control block */
memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
IPCB(skb)->iif = skb->skb_iif;
/* Must drop socket now because of tproxy. */
if (!skb_sk_is_prefetched(skb))
skb_orphan(skb);
return skb;
csum_error:
__IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS);
inhdr_error:
__IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
drop:
kfree_skb(skb);
out:
return NULL;
}
ip_rcv_finish 和 ip_rcv_finish_core
ip_input.c
static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
int ret;
/* if ingress device is enslaved to an L3 master device pass the
* skb to its handler for processing
*/
//如果数据报属于L3层,则传递给其处理程序进行处理
skb = l3mdev_ip_rcv(skb);
if (!skb)
return NET_RX_SUCCESS;
ret = ip_rcv_finish_core(net, sk, skb, dev, NULL);
//dst_input实际上会调用skb->dst->input(skb).input函数会根据路由信息设置为合适的
//函数指针,如果是递交到本地的则为ip_local_deliver,若是转发则为ip_forward.
if (ret != NET_RX_DROP)
ret = dst_input(skb);
return ret;
}
ip_input.c
static int ip_rcv_finish_core(struct net *net, struct sock *sk,
struct sk_buff *skb, struct net_device *dev,
const struct sk_buff *hint)
{
const struct iphdr *iph = ip_hdr(skb);
int (*edemux)(struct sk_buff *skb);
struct rtable *rt;
int err;
if (ip_can_use_hint(skb, iph, hint)) {
err = ip_route_use_hint(skb, iph->daddr, iph->saddr, iph->tos,
dev, hint);
if (unlikely(err))
goto drop_error;
}
if (net->ipv4.sysctl_ip_early_demux &&
!skb_dst(skb) &&
!skb->sk &&
!ip_is_fragment(iph)) {
const struct net_protocol *ipprot;
int protocol = iph->protocol;
ipprot = rcu_dereference(inet_protos[protocol]);
if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) {
err = INDIRECT_CALL_2(edemux, tcp_v4_early_demux,
udp_v4_early_demux, skb);
if (unlikely(err))
goto drop_error;
/* must reload iph, skb->head might have changed */
iph = ip_hdr(skb);
}
}
/*
* Initialise the virtual path cache for the packet. It describes
* how the packet travels inside Linux networking.
* 为数据包初始化虚拟路径缓存,它描述了数据包是如何在linux网络中传播的
*/
//通常从外界接收的数据包,skb->dst不会包含路由信息,暂时还不知道在何处会设置这个字段
//ip_route_input函数会根据路由表设置路由信息
if (!skb_valid_dst(skb)) {
err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
iph->tos, dev);
if (unlikely(err))
goto drop_error;
}
//更新统计数据
#ifdef CONFIG_IP_ROUTE_CLASSID
if (unlikely(skb_dst(skb)->tclassid)) {
struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
u32 idx = skb_dst(skb)->tclassid;
st[idx&0xFF].o_packets++;
st[idx&0xFF].o_bytes += skb->len;
st[(idx>>16)&0xFF].i_packets++;
st[(idx>>16)&0xFF].i_bytes += skb->len;
}
#endif
//如果IP头部大于20字节,则表示IP头部包含IP选项,需要进行选项处理
if (iph->ihl > 5 && ip_rcv_options(skb, dev))
goto drop;
//skb_rtable函数等同于skb_dst函数,获取skb->dst
rt = skb_rtable(skb);
if (rt->rt_type == RTN_MULTICAST) {
__IP_UPD_PO_STATS(net, IPSTATS_MIB_INMCAST, skb->len);
} else if (rt->rt_type == RTN_BROADCAST) {
__IP_UPD_PO_STATS(net, IPSTATS_MIB_INBCAST, skb->len);
} else if (skb->pkt_type == PACKET_BROADCAST ||
skb->pkt_type == PACKET_MULTICAST) {
struct in_device *in_dev = __in_dev_get_rcu(dev);
/* RFC 1122 3.3.6:
*
* When a host sends a datagram to a link-layer broadcast
* address, the IP destination address MUST be a legal IP
* broadcast or IP multicast address.
*
* A host SHOULD silently discard a datagram that is received
* via a link-layer broadcast (see Section 2.4) but does not
* specify an IP multicast or broadcast destination address.
*
* This doesn't explicitly say L2 *broadcast*, but broadcast is
* in a way a form of multicast and the most common use case for
* this is 802.11 protecting against cross-station spoofing (the
* so-called "hole-196" attack) so do it for both.
*/
if (in_dev &&
IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST))
goto drop;
}
return NET_RX_SUCCESS;
drop:
kfree_skb(skb);
return NET_RX_DROP;
drop_error:
if (err == -EXDEV)
__NET_INC_STATS(net, LINUX_MIB_IPRPFILTER);
goto drop;
}
ip_local_deliver
ip_input.c
/* 将IP数据包传递到更高的协议层。
* Deliver IP Packets to the higher protocol layers.
*/
int ip_local_deliver(struct sk_buff *skb)
{
/*
* Reassemble IP fragments.
* 分片重组
*/
struct net *net = dev_net(skb->dev);
if (ip_is_fragment(ip_hdr(skb))) {
if (ip_defrag(net, skb, IP_DEFRAG_LOCAL_DELIVER))
return 0;
}
/* 经过LOCAL_IN钩子点 */
return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN,
net, NULL, skb, skb->dev, NULL,
ip_local_deliver_finish);
}
ip_local_deliver_finish 和 ip_protocol_deliver_rcu
ip_input.c
static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
/* 去掉ip头 */
__skb_pull(skb, skb_network_header_len(skb));
rcu_read_lock();//获取锁
ip_protocol_deliver_rcu(net, skb, ip_hdr(skb)->protocol);
rcu_read_unlock();
return 0;
}
void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int protocol)
{
const struct net_protocol *ipprot;
int raw, ret;
resubmit:
/* 原始套接口,复制一个副本,输出到该套接口 */
raw = raw_local_deliver(skb, protocol);
/* 获取协议处理结构 */
ipprot = rcu_dereference(inet_protos[protocol]);
if (ipprot) {
if (!ipprot->no_policy) {
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
kfree_skb(skb);
return;
}
nf_reset_ct(skb);
}
//注册tcp和udp接收函数
ret = INDIRECT_CALL_2(ipprot->handler, tcp_v4_rcv, udp_rcv,
skb);
if (ret < 0) {
protocol = -ret;
goto resubmit;
}
__IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
} else {
if (!raw) {/* 原始套接口未接收或接收异常 */
if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
__IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS);
/* 发送icmp */
icmp_send(skb, ICMP_DEST_UNREACH,
ICMP_PROT_UNREACH, 0);
}
kfree_skb(skb);
} else {
/* 原始套接口接收 */
__IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
/* 释放包 */
consume_skb(skb);
}
}
}
传输层
根据IP报头协议,以TCP为例,TCP接收函数为:int tcp_v4_rcv(struct sk_buff *skb)。
所在目录:/linux-5.11.1/net/ipv4/tcp_ipv4.c。
tcp_v4_rcv
tcp_input.c
/* TCP 接收数据的入口
* From tcp_input.c
* skb:从 IP 层传递过来的数据报
*/
int tcp_v4_rcv(struct sk_buff *skb)
{
struct net *net = dev_net(skb->dev);
struct sk_buff *skb_to_free;
int sdif = inet_sdif(skb);
int dif = inet_iif(skb);
const struct iphdr *iph;
const struct tcphdr *th;
bool refcounted;
struct sock *sk;
int ret;
/* 如果不是发往本机的就直接丢弃 */
if (skb->pkt_type != PACKET_HOST)
goto discard_it;
/* Count it even if it's bad */
__TCP_INC_STATS(net, TCP_MIB_INSEGS);
/*
如果 TCP 段在传输过程中被分片了,则到达本地后会在 IP 层重新组装。
组装完成后,报文分片都存储在链表中。在此,需把存储在分片中的报文
复制到 SKB 的线性存储区域中。如果发生异常,则丢弃该报文。
*/
if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
goto discard_it;
th = (const struct tcphdr *)skb->data;//获得tcp头
/*
如果 TCP 的首部长度小于不带数据的 TCP 的首部长度,则说明 TCP 数据异常。
统计相关信息后,丢弃。
*/
if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
goto bad_packet;
//保证skb的线性区域至少包括实际的TCP首部
if (!pskb_may_pull(skb, th->doff * 4))
goto discard_it;
/* An explanation is required here, I think.
* Packet length and doff are validated by header prediction,
* provided case of th->doff==0 is eliminated.
* So, we defer the checks. */
//验证 TCP 首部中的校验和,如校验和有误,则说明报文已损坏,统计相关信息后丢弃。
if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
goto csum_error;
//初始化skb中的控制块
th = (const struct tcphdr *)skb->data;//tcp头
iph = ip_hdr(skb);//IP头
/* 在 ehash 或 bhash 散列表中根据地址和端口来查找传
输控制块。如果在 ehash 中找到,则表示已经经历了三次握手并且已建立了连接,可以
进行正常的通信。如果在 bhash 中找到,则表示已经绑定已经绑定了端口,处于侦听
状态。如果在两个散列表中都查找不到,说明此时对应的传输控制块还没有创建,跳转
到no_tcp_socket 处处理。
*/
lookup:
sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
th->dest, sdif, &refcounted);
if (!sk)
goto no_tcp_socket;
process:
//TCP_TIME_WAIT需要做特殊处理,这里先不关注
if (sk->sk_state == TCP_TIME_WAIT)
goto do_time_wait;
if (sk->sk_state == TCP_NEW_SYN_RECV) {
struct request_sock *req = inet_reqsk(sk);
bool req_stolen = false;
struct sock *nsk;
sk = req->rsk_listener;
if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
sk_drops_add(sk, skb);
reqsk_put(req);
goto discard_it;
}
if (tcp_checksum_complete(skb)) {
reqsk_put(req);
goto csum_error;
}
if (unlikely(sk->sk_state != TCP_LISTEN)) {
inet_csk_reqsk_queue_drop_and_put(sk, req);
goto lookup;
}
/* We own a reference on the listener, increase it again
* as we might lose it too soon.
*/
sock_hold(sk);
refcounted = true;
nsk = NULL;
if (!tcp_filter(sk, skb)) {
th = (const struct tcphdr *)skb->data;
iph = ip_hdr(skb);
tcp_v4_fill_cb(skb, iph, th);
nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
}
if (!nsk) {
reqsk_put(req);
if (req_stolen) {
/* Another cpu got exclusive access to req
* and created a full blown socket.
* Try to feed this packet to this socket
* instead of discarding it.
*/
tcp_v4_restore_cb(skb);
sock_put(sk);
goto lookup;
}
goto discard_and_relse;
}
if (nsk == sk) {
reqsk_put(req);
tcp_v4_restore_cb(skb);
} else if (tcp_child_process(sk, nsk, skb)) {
tcp_v4_send_reset(nsk, skb);
goto discard_and_relse;
} else {
sock_put(sk);
return 0;
}
}
/*ttl 小于给定的最小的 ttl*/
if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
goto discard_and_relse;
}
//查找 IPsec 数据库,如果查找失败,进行相应处理.
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
goto discard_and_relse;
//md5 相关
if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
goto discard_and_relse;
nf_reset_ct(skb);
//TCP套接字过滤器,如果数据包被过滤掉了,结束处理过程 -> sk_filter_trim_cap
if (tcp_filter(sk, skb))
goto discard_and_relse;
th = (const struct tcphdr *)skb->data;
iph = ip_hdr(skb);
tcp_v4_fill_cb(skb, iph, th);
//到了传输层,该字段已经没有意义,将其置为空
skb->dev = NULL;
/*LISTEN 状态 */
if (sk->sk_state == TCP_LISTEN) {
ret = tcp_v4_do_rcv(sk, skb);//交由tcp_v4_do_rcv()处理
goto put_and_return;
}
sk_incoming_cpu_update(sk);
//先持锁,这样进程上下文和其它软中断则无法操作该TCB
bh_lock_sock_nested(sk);
tcp_segs_in(tcp_sk(sk), skb);
ret = 0;
//如果当前TCB没有被进程上下文锁定,首先尝试将数据包放入prequeue队列,
//如果prequeue队列没有处理,再将其处理后放入receive队列。如果TCB已
//经被进程上下文锁定,那么直接将数据包放入backlog队列
if (!sock_owned_by_user(sk)) {
skb_to_free = sk->sk_rx_skb_cache;
sk->sk_rx_skb_cache = NULL;
ret = tcp_v4_do_rcv(sk, skb);
} else {
if (tcp_add_backlog(sk, skb))//TCB被用户进程锁定,直接将数据包放入backlog队列
goto discard_and_relse;
skb_to_free = NULL;
}
bh_unlock_sock(sk);//释放锁
if (skb_to_free)
__kfree_skb(skb_to_free);
put_and_return:
if (refcounted)
sock_put(sk);//释放TCB引用计数,当计数为 0 的时候,使用 sk\_free 释放传输控制块
return ret;//返回处理结果
//处理没有创建传输控制块收到报文,校验错误,坏包的情况,给对端发送 RST 报文。
no_tcp_socket:
/* 查找 IPsec 数据库,如果查找失败,进行相应处理.*/
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
goto discard_it;
tcp_v4_fill_cb(skb, iph, th);
if (tcp_checksum_complete(skb)) {
csum_error:
__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
__TCP_INC_STATS(net, TCP_MIB_INERRS);
} else {
tcp_v4_send_reset(NULL, skb);
}
discard_it:
/* Discard frame. */
//丢弃帧
kfree_skb(skb);
return 0;
discard_and_relse:
sk_drops_add(sk, skb);
if (refcounted)
sock_put(sk);
goto discard_it;
//处理TIME_WAIT状态
do_time_wait:
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
/*inet_timewait_sock 减少引用计数 */
inet_twsk_put(inet_twsk(sk));
goto discard_it;
}
tcp_v4_fill_cb(skb, iph, th);
if (tcp_checksum_complete(skb)) {
inet_twsk_put(inet_twsk(sk));
goto csum_error;
}
//根据返回值进行相应处理
switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
case TCP_TW_SYN: {
struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
&tcp_hashinfo, skb,
__tcp_hdrlen(th),
iph->saddr, th->source,
iph->daddr, th->dest,
inet_iif(skb),
sdif);
if (sk2) {
inet_twsk_deschedule_put(inet_twsk(sk));
sk = sk2;
tcp_v4_restore_cb(skb);
refcounted = false;
goto process;
}
}
/* to ACK */
fallthrough;
case TCP_TW_ACK:
tcp_v4_timewait_ack(sk, skb);
break;
case TCP_TW_RST:
tcp_v4_send_reset(sk, skb);
inet_twsk_deschedule_put(inet_twsk(sk));
goto discard_it;
case TCP_TW_SUCCESS:;
}
goto discard_it;
}
tcp_v4_do_rcv
所在目录:/linux-5.11.1/net/ipv4/tcp_ipv4.c。
/* The socket must have it's spinlock held when we get
* here, unless it is a TCP_LISTEN socket.
* socket必须有自旋锁,除非是TCP_LISTEN socket.
*
* We have a potential double-lock case here, so even when
* doing backlog processing we use the BH locking scheme.
* This is because we cannot sleep with the original spinlock
* held.
* 我们这里有一个潜在的双锁情况,所以即使在做积压处理时,我们也使用BH锁定方案。这是因为我们不能用原来的自旋锁来睡觉。
*/
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
struct sock *rsk;
/* 当状态为ESTABLISHED时,用tcp_rcv_established()接收处理 */
if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
struct dst_entry *dst = sk->sk_rx_dst;
sock_rps_save_rxhash(sk, skb);
sk_mark_napi_id(sk, skb);
if (dst) {
if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
!dst->ops->check(dst, 0)) {
dst_release(dst);
sk->sk_rx_dst = NULL;
}
}
/* 连接已建立时的处理路径 */
tcp_rcv_established(sk, skb);
return 0;
}
if (tcp_checksum_complete(skb))
goto csum_err;
/* 如果这个sock处于监听状态,被动打开时的处理,包括收到SYN或ACK */
if (sk->sk_state == TCP_LISTEN) {
/*
NULL,错误
nsk == sk,接收到 SYN
nsk != sk,接收到 ACK
*/
struct sock *nsk = tcp_v4_cookie_check(sk, skb);
if (!nsk)
goto discard;
if (nsk != sk) {
if (tcp_child_process(sk, nsk, skb)) { /* 处理新的sock ,初始化子传输控制块*/
rsk = nsk;
goto reset;//失败时,给客户端发送 RST 段进行复位
}
return 0;
}
} else
sock_rps_save_rxhash(sk, skb);
/* 处理除了ESTABLISHED和TIME_WAIT之外的所有状态 */
if (tcp_rcv_state_process(sk, skb)) {
rsk = sk;
goto reset;
}
return 0;
reset:
tcp_v4_send_reset(rsk, skb); /* 发送被动的RST包 */
discard:
kfree_skb(skb);
/* Be careful here. If this function gets more complicated and
* gcc suffers from register pressure on the x86, sk (in %ebx)
* might be destroyed here. This current version compiles correctly,
* but you have been warned.
*/
return 0;
csum_err:
TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
goto discard;
}
tcp_rcv_state_process
tcp_input.c
tcp_rcv_state_process实现了 TCP 状态机相对核心的一个部分。该函数可以处理除 ESTABLISHED 和 TIME_WAIT 状态以外的情况下的接收过程。
/*
* This function implements the receiving procedure of RFC 793 for
* all states except ESTABLISHED and TIME_WAIT.
* It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
* address independent.
* 该函数实现了除已建立状态和TIME_WAIT外的所有状态的RFC 793的接收过程。
* 它从tcp_v4_rcv和tcp_v6_rcv都调用,应该是独立的地址。
*sk: 传输控制块
*skb:缓存块
*/
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcphdr *th = tcp_hdr(skb);
struct request_sock *req;
int queued = 0;
bool acceptable;
switch (sk->sk_state) {
case TCP_CLOSE:/* CLOSE 状态的处理代码 */
goto discard;
case TCP_LISTEN:/* LISTEN 状态的处理代码 */
if (th->ack)
return 1;
if (th->rst)
goto discard;
if (th->syn) {
if (th->fin)
goto discard;
/* It is possible that we process SYN packets from backlog,
* so we need to make sure to disable BH and RCU right there.
*/
rcu_read_lock();
local_bh_disable();
acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0;
local_bh_enable();
rcu_read_unlock();
if (!acceptable)
return 1;
consume_skb(skb);
return 0;
}
goto discard;
case TCP_SYN_SENT://客户端发送SYN第一次握手,等待服务端回复SYN+ACK第二次握手
tp->rx_opt.saw_tstamp = 0;
tcp_mstamp_refresh(tp);
queued = tcp_rcv_synsent_state_process(sk, skb, th);
if (queued >= 0)
return queued;
/* Do step6 onward by hand. */
tcp_urg(sk, skb, th);
__kfree_skb(skb);
tcp_data_snd_check(sk);
return 0;
}
tcp_mstamp_refresh(tp);
tp->rx_opt.saw_tstamp = 0;
req = rcu_dereference_protected(tp->fastopen_rsk,
lockdep_sock_is_held(sk));
if (req) {
bool req_stolen;
WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
sk->sk_state != TCP_FIN_WAIT1);
if (!tcp_check_req(sk, skb, req, true, &req_stolen))
goto discard;
}
if (!th->ack && !th->rst && !th->syn)
goto discard;
if (!tcp_validate_incoming(sk, skb, th, 0))
return 0;
/* step 5: check the ACK field */
//对收到的 ACK 段进行处理判断是否正确接收,如果正确接收就会发送返回非零值。
acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
FLAG_UPDATE_TS_RECENT |
FLAG_NO_CHALLENGE_ACK) > 0;
if (!acceptable) {
if (sk->sk_state == TCP_SYN_RECV)
return 1; /* send one RST */
tcp_send_challenge_ack(sk, skb);
goto discard;
}
switch (sk->sk_state) {
case TCP_SYN_RECV://服务端发送SYN+ACK第二次握手,等待客户端回复ACK第三次握手
tp->delivered++; /* SYN-ACK delivery isn't tracked in tcp_ack */
if (!tp->srtt_us)
tcp_synack_rtt_meas(sk, req);
if (req) {
tcp_rcv_synrecv_state_fastopen(sk);
} else {
tcp_try_undo_spurious_syn(sk);
tp->retrans_stamp = 0;
tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB,
skb);
WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
}
smp_mb();
//进行一系列的初始化,开启相应拥塞控制等,并且将 TCP 的状态置为 TCP_ESTABLISHED。
tcp_set_state(sk, TCP_ESTABLISHED);
sk->sk_state_change(sk);
/* Note, that this wakeup is only for marginal crossed SYN case.
* Passively open sockets are not waked up, because
* sk->sk_sleep == NULL and sk->sk_socket == NULL.
*/
//发信号给那些将通过该套接口发送数据的进程,通知它们套接口目前已经可以发送数据了
if (sk->sk_socket)
sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
//初始化传输控制块的各个字段,对时间戳进行处理。
tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
if (tp->rx_opt.tstamp_ok)
tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
if (!inet_csk(sk)->icsk_ca_ops->cong_control)
tcp_update_pacing_rate(sk);
/* Prevent spurious tcp_cwnd_restart() on first data packet */
tp->lsndtime = tcp_jiffies32;
//更新最近一次的发送数据报的时间,初始化与路径 MTU 有关的成员,并计算有关TCP 首部预测的标志
tcp_initialize_rcv_mss(sk);
tcp_fast_path_on(tp);
break;
case TCP_FIN_WAIT1: {//发送SIN+ACK第一次挥手后,等待对方回复ACK第二次挥手
int tmo;
if (req)
tcp_rcv_synrecv_state_fastopen(sk);
if (tp->snd_una != tp->write_seq)
break;
tcp_set_state(sk, TCP_FIN_WAIT2);
sk->sk_shutdown |= SEND_SHUTDOWN;
sk_dst_confirm(sk);
if (!sock_flag(sk, SOCK_DEAD)) {
/* Wake up lingering close() */
sk->sk_state_change(sk);
break;
}
if (tp->linger2 < 0) {
tcp_done(sk);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
return 1;
}
if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
/* Receive out of order FIN after close() */
if (tp->syn_fastopen && th->fin)
tcp_fastopen_active_disable(sk);
tcp_done(sk);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
return 1;
}
tmo = tcp_fin_time(sk);
if (tmo > TCP_TIMEWAIT_LEN) {
inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
} else if (th->fin || sock_owned_by_user(sk)) {
/* Bad case. We could lose such FIN otherwise.
* It is not a big problem, but it looks confusing
* and not so rare event. We still can lose it now,
* if it spins in bh_lock_sock(), but it is really
* marginal case.
*/
inet_csk_reset_keepalive_timer(sk, tmo);
} else {
tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
goto discard;
}
break;
}
case TCP_CLOSING:
if (tp->snd_una == tp->write_seq) {
tcp_time_wait(sk, TCP_TIME_WAIT, 0);
goto discard;
}
break;
case TCP_LAST_ACK:
if (tp->snd_una == tp->write_seq) {
tcp_update_metrics(sk);
tcp_done(sk);
goto discard;
}
break;
}
/* step 6: check the URG bit */
tcp_urg(sk, skb, th);
/* step 7: process the segment text */
switch (sk->sk_state) {
case TCP_CLOSE_WAIT:
case TCP_CLOSING:
case TCP_LAST_ACK:
if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
if (sk_is_mptcp(sk))
mptcp_incoming_options(sk, skb);
break;
}
fallthrough;
case TCP_FIN_WAIT1:
case TCP_FIN_WAIT2:
/* RFC 793 says to queue data in these states,
* RFC 1122 says we MUST send a reset.
* BSD 4.4 also does reset.
*/
if (sk->sk_shutdown & RCV_SHUTDOWN) {
if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
tcp_reset(sk, skb);
return 1;
}
}
fallthrough;
case TCP_ESTABLISHED://对已接收到的 TCP 段排队,在建立连接阶段一般不会收到 TCP 段
tcp_data_queue(sk, skb);
queued = 1;
break;
}
/* tcp_data could move socket to TIME-WAIT */
//此时状态不为 CLOSE,故而就回去检测是否数据和 ACK 要发送。
//其次,根据 queue 标志来确定是否释放接收到的 TCP 段,如果接收到的 TCP 段已添加到接收队列中,则不释放
if (sk->sk_state != TCP_CLOSE) {
tcp_data_snd_check(sk);
tcp_ack_snd_check(sk);
}
if (!queued) {
discard:
tcp_drop(sk, skb);
}
return 0;
}
tcp_rcv_established
1、状态为ESTABLISHED时,用tcp_rcv_established()接收处理。
2. 状态为LISTEN时,说明这个sock处于监听状态,用于被动打开的接收处理,包括SYN和ACK。
3. 当状态不为ESTABLISHED或TIME_WAIT时,用tcp_rcv_state_process()处理。
所在目录:/linux-5.11.1/net/ipv4/tcp_input.c。
int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
struct tcphdr *th, unsigned len)
{
struct tcp_sock *tp = tcp_sk(sk);
int res;
/*
* Header prediction.
* The code loosely follows the one in the famous
* "30 instruction TCP receive" Van Jacobson mail.
*
* Van's trick is to deposit buffers into socket queue
* on a device interrupt, to call tcp_recv function
* on the receive process context and checksum and copy
* the buffer to user space. smart...
*
* Our current scheme is not silly either but we take the
* extra cost of the net_bh soft interrupt processing...
* We do checksum and copy also but from device to kernel.
*/
tp->rx_opt.saw_tstamp = 0;
/* pred_flags is 0xS?10 << 16 + snd_wnd
* if header_prediction is to be made
* 'S' will always be tp->tcp_header_len >> 2
* '?' will be 0 for the fast path, otherwise pred_flags is 0 to
* turn it off (when there are holes in the receive
* space for instance)
* PSH flag is ignored.
*/
//预定向标志和输入数据段的标志比较
//数据段序列号是否正确
if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
int tcp_header_len = tp->tcp_header_len;
/* Timestamp header prediction: tcp_header_len
* is automatically equal to th->doff*4 due to pred_flags
* match.
*/
/* Check timestamp */
//时间戳选项之外如果还有别的选项就送给Slow Path处理
if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
/* No? Slow path! */
if (!tcp_parse_aligned_timestamp(tp, th))
goto slow_path;
//对数据包做PAWS快速检查,如果检查走Slow Path处理
/* If PAWS failed, check it more carefully in slow path */
if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
goto slow_path;
/* DO NOT update ts_recent here, if checksum fails
* and timestamp was corrupted part, it will result
* in a hung connection since we will drop all
* future packets due to the PAWS test.
*/
}
//数据包长度太小
if (len <= tcp_header_len) {
/* Bulk data transfer: sender */
if (len == tcp_header_len) {
/* Predicted packet is in window by definition.
* seq == rcv_nxt and rcv_wup <= rcv_nxt.
* Hence, check seq<=rcv_wup reduces to:
*/
if (tcp_header_len ==
(sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
tp->rcv_nxt == tp->rcv_wup)
tcp_store_ts_recent(tp);
/* We know that such packets are checksummed
* on entry.
*/
tcp_ack(sk, skb, 0);
__kfree_skb(skb);
tcp_data_snd_check(sk);
return 0;
} else { /* Header too small */
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
goto discard;
}
} else {
int eaten = 0;
int copied_early = 0;
//tp->copied_seq表示未读的数据包序列号
//tp->rcv_nxt表示下一个期望读取的数据包序列号
//len-tcp_header_len小于tp->ucpoy.len表示数据包还没有复制完
if (tp->copied_seq == tp->rcv_nxt &&
len - tcp_header_len <= tp->ucopy.len) {
#ifdef CONFIG_NET_DMA
if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
copied_early = 1;
eaten = 1;
}
#endif
//当前进程是否有锁定
//当前进程的全局指针current
//tp->ucopy.task指针是否等于当前进程
if (tp->ucopy.task == current &&
sock_owned_by_user(sk) && !copied_early) {
__set_current_state(TASK_RUNNING);
//将数据包复制到应用层空间
if (!tcp_copy_to_iovec(sk, skb, tcp_header_len))
eaten = 1;
}
//复制成功
if (eaten) {
/* Predicted packet is in window by definition.
* seq == rcv_nxt and rcv_wup <= rcv_nxt.
* Hence, check seq<=rcv_wup reduces to:
*/
if (tcp_header_len ==
(sizeof(struct tcphdr) +
TCPOLEN_TSTAMP_ALIGNED) &&
tp->rcv_nxt == tp->rcv_wup)
tcp_store_ts_recent(tp);
tcp_rcv_rtt_measure_ts(sk, skb);
__skb_pull(skb, tcp_header_len);
tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER);
}
//清除prequeue队列中已经复制的数据包,并回复ack
if (copied_early)
tcp_cleanup_rbuf(sk, skb->len);
}
//复制不成功
if (!eaten) {
//从新计算校验和
if (tcp_checksum_complete_user(sk, skb))
goto csum_error;
/* Predicted packet is in window by definition.
* seq == rcv_nxt and rcv_wup <= rcv_nxt.
* Hence, check seq<=rcv_wup reduces to:
*/
if (tcp_header_len ==
(sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
tp->rcv_nxt == tp->rcv_wup)
tcp_store_ts_recent(tp);
tcp_rcv_rtt_measure_ts(sk, skb);
if ((int)skb->truesize > sk->sk_forward_alloc)
goto step5;
//大块数据传送
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS);
/* Bulk data transfer: receiver */
//去掉tcp头部
__skb_pull(skb, tcp_header_len);
//将数据包加入到sk_receive_queue队列中
__skb_queue_tail(&sk->sk_receive_queue, skb);
skb_set_owner_r(skb, sk);
tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
}
//更新延迟回答时钟超时间隔值
tcp_event_data_recv(sk, skb);
if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
/* Well, only one small jumplet in fast path... */
tcp_ack(sk, skb, FLAG_DATA);
tcp_data_snd_check(sk);
if (!inet_csk_ack_scheduled(sk))
goto no_ack;
}
//收到数据后回复ack确认
if (!copied_early || tp->rcv_nxt != tp->rcv_wup)
__tcp_ack_snd_check(sk, 0);
no_ack:
#ifdef CONFIG_NET_DMA
if (copied_early)
__skb_queue_tail(&sk->sk_async_wait_queue, skb);
else
#endif
if (eaten)
__kfree_skb(skb);
else
//no_ack标签表明套接字已经准备好下一次应用读
sk->sk_data_ready(sk, 0);
return 0;
}
}
slow_path:
if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb))
goto csum_error;
/*
* Standard slow path.
*/
res = tcp_validate_incoming(sk, skb, th, 1);
if (res <= 0)
return -res;
step5:
if (th->ack && tcp_ack(sk, skb, FLAG_SLOWPATH) < 0)
goto discard;
tcp_rcv_rtt_measure_ts(sk, skb);
/* Process urgent data. */
//紧急数据段处理
tcp_urg(sk, skb, th);
/* step 7: process the segment text */
//根据情况将数据复制到应用层或者
//将数据加入sk_receive_queue常规队列中
tcp_data_queue(sk, skb);
tcp_data_snd_check(sk);
tcp_ack_snd_check(sk);
return 0;
csum_error:
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
discard:
__kfree_skb(skb);
return 0;
}
tcp_recvmsg
用户进程调用recvfrom读取套接字缓冲区上的数据,实际是调用tcp_recvmsg函数将数据包从内核地址空间复制到用户考地址空间。
所在目录:/linux-5.11.1/net/ipv4/tcp.c。
int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
size_t len, int nonblock, int flags, int *addr_len)
{
//获取TCP套接字结构
struct tcp_sock *tp = tcp_sk(sk);
int copied = 0;
u32 peek_seq;
u32 *seq;
unsigned long used;
int err;
int target; /* Read at least this many bytes */
long timeo;
struct task_struct *user_recv = NULL;
int copied_early = 0;
struct sk_buff *skb;
u32 urg_hole = 0;
//锁住套接字,其实就是设置sk->sk_lock.owned = 1
//当产生软中断调用tcp_v4_rcv获取套接字sock发现
//sock处于进程上下文,就会把数据包加入到balock_queue队列中
lock_sock(sk);
TCP_CHECK_TIMER(sk);
err = -ENOTCONN;
//套接字当前处于监听状态就直接跳出
if (sk->sk_state == TCP_LISTEN)
goto out;
//查实时间,如果是非阻塞模式就为0
timeo = sock_rcvtimeo(sk, nonblock);
//紧急处理数据
/* Urgent data needs to be handled specially. */
if (flags & MSG_OOB)
goto recv_urg;
//未读取数据包的开始序列号
seq = &tp->copied_seq;
if (flags & MSG_PEEK) {
peek_seq = tp->copied_seq;
seq = &peek_seq;
}
//取len和sk->rcvlowat中的最小值
//MSG_WAITALL标志是判断是否要接受完整的数据包后再拷贝复制数据包
target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
//配置了DMA可以直接通过访问内存复制数据到用户地址空间
#ifdef CONFIG_NET_DMA
tp->ucopy.dma_chan = NULL;
preempt_disable();
skb = skb_peek_tail(&sk->sk_receive_queue);
{
int available = 0;
if (skb)
available = TCP_SKB_CB(skb)->seq + skb->len - (*seq);
if ((available < target) &&
(len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
!sysctl_tcp_low_latency &&
dma_find_channel(DMA_MEMCPY)) {
preempt_enable_no_resched();
tp->ucopy.pinned_list =
dma_pin_iovec_pages(msg->msg_iov, len);
} else {
preempt_enable_no_resched();
}
}
#endif
//主循环,复制数据到用户地址空间直到target为0
do {
u32 offset;
/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
//遇到紧急数据停止处理跳出循环
if (tp->urg_data && tp->urg_seq == *seq) {
if (copied)
break;
//检测套接字上是否有信号等待处理,确保能处理SIGUSR信号。
if (signal_pending(current)) {
//检查是否超时
copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
break;
}
}
/* Next get a buffer. */
//循环变量接受缓冲区队列receive_queue队列
skb_queue_walk(&sk->sk_receive_queue, skb) {
/* Now that we have two receive queues this
* shouldn't happen.
*/
if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
KERN_INFO "recvmsg bug: copied %X "
"seq %X rcvnxt %X fl %X\n", *seq,
TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
flags))
break;
//未读取数据包的序列号和已经读取数据包的序列号差
//如果这个差小于数据包长度skb->len,表示这是我们要找的数据包
//因为是最小的序列号
offset = *seq - TCP_SKB_CB(skb)->seq;
//如果是syn表就跳过
if (tcp_hdr(skb)->syn)
offset--;
//找到了skb,跳转到found_ok_skb处完成复制工作
if (offset < skb->len)
goto found_ok_skb;
//发现是fin包调转到fin处理标签处
if (tcp_hdr(skb)->fin)
goto found_fin_ok;
WARN(!(flags & MSG_PEEK), KERN_INFO "recvmsg bug 2: "
"copied %X seq %X rcvnxt %X fl %X\n",
*seq, TCP_SKB_CB(skb)->seq,
tp->rcv_nxt, flags);
}
/* Well, if we have backlog, try to process it now yet. */
//缓冲区recieve_queue队列中已经没有数据
//而且backlog_queue队列中也没有数据了就跳出循环
if (copied >= target && !sk->sk_backlog.tail)
break;
if (copied) {
//检查套接字的状态是否是关闭
//或者收到远端的断开请求,则要跳出复制循环
if (sk->sk_err ||
sk->sk_state == TCP_CLOSE ||
(sk->sk_shutdown & RCV_SHUTDOWN) ||
!timeo ||
signal_pending(current))
break;
} else {
//copied为0表示应用层没有复制到数据,没有复制到数据有三种可能
//第一是套接字已经关闭了,第二是缓冲区根本没有数据
//第三是其他错误
if (sock_flag(sk, SOCK_DONE))
break;
if (sk->sk_err) {
copied = sock_error(sk);
break;
}
if (sk->sk_shutdown & RCV_SHUTDOWN)
break;
if (sk->sk_state == TCP_CLOSE) {
//当用户关闭套接字会设置SOCK_DON标志
//连接状态是TCP_CLOSE,SOCK_DONE标志就不会0
if (!sock_flag(sk, SOCK_DONE)) {
/* This occurs when user tries to read
* from never connected socket.
*/
copied = -ENOTCONN;
break;
}
break;
}
//查看是否阻塞,不阻塞直接返回
//返回的错误标志是EAGAIN
if (!timeo) {
copied = -EAGAIN;
break;
}
//读取数据失败可能是其他错误
//返回错误原因
if (signal_pending(current)) {
copied = sock_intr_errno(timeo);
break;
}
}
//根据已经复制数据长度copied清除recieve_queue队列
//并且回复对端ack包
tcp_cleanup_rbuf(sk, copied);
//sk_recieve_queue队列中已无数据需要处理就处理preueue队列上的数据
//prequeue队列的处理现场是用户进程
if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
/* Install new reader */
if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
//复制pre_queue队列的进程是当前进程
user_recv = current;
//处理数据的用户进程
tp->ucopy.task = user_recv;
//应用层接受数据的缓冲区地址
tp->ucopy.iov = msg->msg_iov;
}
//拷贝数据长度
tp->ucopy.len = len;
WARN_ON(tp->copied_seq != tp->rcv_nxt &&
!(flags & (MSG_PEEK | MSG_TRUNC)));
//prequeu队列不为空,必须在释放套接字之前处理这些数据包
//如果这个处理没有完成则数据段顺序将会被破坏,接受段处理顺序是
//flight中的数据、backlog队列、prequeue队列、sk_receive_queue队列,只有当前队列处理
//完成了才会去处理下一个队列。prequeue队列可能在循环结束套接字释放前又
//加入数据包,调转到do_prequeue标签处理
if (!skb_queue_empty(&tp->ucopy.prequeue))
goto do_prequeue;
/* __ Set realtime policy in scheduler __ */
}
#ifdef CONFIG_NET_DMA
if (tp->ucopy.dma_chan)
dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
#endif
//数据包复制完毕
if (copied >= target) {
/* Do not sleep, just process backlog. */
//从backlog队列中复制数据包到sk_receive_queue队列
release_sock(sk);
lock_sock(sk);
} else
//已经没有数据要处理,将套接字放入等待状态,进程进入睡眠
//如果有数据段来了tcp_prequeue会唤醒进程,软中断会判断用户进程睡眠
//如果睡眠就会把数据放到prequeue队列中
sk_wait_data(sk, &timeo);
#ifdef CONFIG_NET_DMA
tcp_service_net_dma(sk, false); /* Don't block */
tp->ucopy.wakeup = 0;
#endif
if (user_recv) {
int chunk;
/* __ Restore normal policy in scheduler __ */
if ((chunk = len - tp->ucopy.len) != 0) {
NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
//更新剩余数据长度
len -= chunk;
//更新已经复制的数据长度
copied += chunk;
}
//tp->rcv_nxt == tp->copied_seq判断receive_queue队列中释放有数据
if (tp->rcv_nxt == tp->copied_seq &&
!skb_queue_empty(&tp->ucopy.prequeue)) {
do_prequeue:
//处理prequeue队列
tcp_prequeue_process(sk);
if ((chunk = len - tp->ucopy.len) != 0) {
NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
//更新剩余需要复制数据长度
len -= chunk;
//更新复制的数据copied
copied += chunk;
}
}
}
if ((flags & MSG_PEEK) &&
(peek_seq - copied - urg_hole != tp->copied_seq)) {
if (net_ratelimit())
printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
current->comm, task_pid_nr(current));
peek_seq = tp->copied_seq;
}
continue;
//处理sk_receive_queue队列中的数据
found_ok_skb:
/* Ok so how much can we use? */
used = skb->len - offset;
if (len < used)
used = len;
/* Do we have urgent data here? */
//首先查看是否有紧急数据需要处理
//如果设置套接字选项设置了SO_OOBINLINE就不需要处理紧急数据
//因为有单独处理
if (tp->urg_data) {
u32 urg_offset = tp->urg_seq - *seq;
if (urg_offset < used) {
if (!urg_offset) {
if (!sock_flag(sk, SOCK_URGINLINE)) {
++*seq;
urg_hole++;
offset++;
used--;
if (!used)
goto skip_copy;
}
} else
used = urg_offset;
}
}
if (!(flags & MSG_TRUNC)) {
#ifdef CONFIG_NET_DMA
if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
if (tp->ucopy.dma_chan) {
tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
tp->ucopy.dma_chan, skb, offset,
msg->msg_iov, used,
tp->ucopy.pinned_list);
if (tp->ucopy.dma_cookie < 0) {
printk(KERN_ALERT "dma_cookie < 0\n");
/* Exception. Bailout! */
if (!copied)
copied = -EFAULT;
break;
}
dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
if ((offset + used) == skb->len)
copied_early = 1;
} else
#endif
{
//将数据包从内核地址空间复制到用户地址空间
err = skb_copy_datagram_iovec(skb, offset,
msg->msg_iov, used);
if (err) {
/* Exception. Bailout! */
if (!copied)
copied = -EFAULT;
break;
}
}
}
//更新数据包序列号
*seq += used;
//更新已复制的数据长度
copied += used;
//更新剩下需要复制的数据长度
len -= used;
//重新调整tcp接受窗口
tcp_rcv_space_adjust(sk);
skip_copy:
if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
tp->urg_data = 0;
//处理完了紧急数据,调转到Fast Path处理
tcp_fast_path_check(sk);
}
if (used + offset < skb->len)
continue;
if (tcp_hdr(skb)->fin)
goto found_fin_ok;
if (!(flags & MSG_PEEK)) {
sk_eat_skb(sk, skb, copied_early);
copied_early = 0;
}
continue;
//套接字状态是Fin
found_fin_ok:
/* Process the FIN. */
//序列号加1
++*seq;
if (!(flags & MSG_PEEK)) {
//重新计算tcp窗口
sk_eat_skb(sk, skb, copied_early);
copied_early = 0;
}
break;
} while (len > 0);
//主循环处理结束后,prequeue队列中还有数据则必须继续处理
if (user_recv) {
if (!skb_queue_empty(&tp->ucopy.prequeue)) {
int chunk;
tp->ucopy.len = copied > 0 ? len : 0;
tcp_prequeue_process(sk);
if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
len -= chunk;
copied += chunk;
}
}
tp->ucopy.task = NULL;
tp->ucopy.len = 0;
}
#ifdef CONFIG_NET_DMA
tcp_service_net_dma(sk, true); /* Wait for queue to drain */
tp->ucopy.dma_chan = NULL;
if (tp->ucopy.pinned_list) {
dma_unpin_iovec_pages(tp->ucopy.pinned_list);
tp->ucopy.pinned_list = NULL;
}
#endif
/* According to UNIX98, msg_name/msg_namelen are ignored
* on connected socket. I was just happy when found this 8) --ANK
*/
/* Clean up data we have read: This will do ACK frames. */
tcp_cleanup_rbuf(sk, copied);
TCP_CHECK_TIMER(sk);
release_sock(sk);
if (copied > 0)
uid_stat_tcp_rcv(current_uid(), copied);
return copied;
out:
TCP_CHECK_TIMER(sk);
release_sock(sk);
return err;
recv_urg:
//紧急数据处理,复制紧急数据到用户地址空间
err = tcp_recv_urg(sk, msg, len, flags);
if (err > 0)
uid_stat_tcp_rcv(current_uid(), err);
goto out;
}
数据结构
socket
传输层的socket结构体,这个结构体表征BSD套接字的通用特性。
\linux-5.11.1\include\linux\net.h
/**
* struct socket - general BSD socket
* @state: socket state (%SS_CONNECTED, etc)
* @type: socket type (%SOCK_STREAM, etc)
* @flags: socket flags (%SOCK_NOSPACE, etc)
* @ops: protocol specific socket operations
* @file: File back pointer for gc
* @sk: internal networking protocol agnostic socket representation
* @wq: wait queue for several uses
*/
struct socket {
socket_state state; //连接状态
short type;//套接字类型,如SOCK_STREAM
unsigned long flags;
struct file *file;//套接字对应的文件指针
struct sock *sk;//网络层的套接字
const struct proto_ops *ops;//协议相关的一系列套接字操作
struct socket_wq wq;//等待队列
};
sock
\linux-5.11.1\include\net\sock.h
sock属于网络层,即IP层,该结构体sock中包含了一个基本结构体sock_common。
struct sock {
struct sock_common __sk_common; // 网络层套接字通用结构体
......
socket_lock_t sk_lock; // 套接字同步锁
atomic_t sk_drops; // IP/UDP包丢包统计
int sk_rcvlowat; // SO_RCVLOWAT标记位
......
struct sk_buff_head sk_receive_queue; // 收到的数据包队列
......
int sk_rcvbuf; // 接收缓存大小
......
union {
struct socket_wq __rcu *sk_wq; // 等待队列
struct socket_wq *sk_wq_raw;
};
......
int sk_sndbuf; // 发送缓存大小
/* ===== cache line for TX ===== */
int sk_wmem_queued; // 传输队列大小
refcount_t sk_wmem_alloc; // 已确认的传输字节数
unsigned long sk_tsq_flags; // TCP Small Queue标记位
union {
struct sk_buff *sk_send_head; // 发送队列对首
struct rb_root tcp_rtx_queue;
};
struct sk_buff_head sk_write_queue; // 发送队列
......
u32 sk_pacing_status; /* see enum sk_pacing 发包速率控制状态*/
long sk_sndtimeo; // SO_SNDTIMEO 标记位
struct timer_list sk_timer; // 套接字清空计时器
__u32 sk_priority; // SO_PRIORITY 标记位
......
unsigned long sk_pacing_rate; /* bytes per second 发包速率*/
unsigned long sk_max_pacing_rate; // 最大发包速率
struct page_frag sk_frag; // 缓存页帧
......
struct proto *sk_prot_creator;
rwlock_t sk_callback_lock;
int sk_err, // 上次错误
sk_err_soft; // “软”错误:不会导致失败的错误
u32 sk_ack_backlog; // ack队列长度
u32 sk_max_ack_backlog; // 最大ack队列长度
kuid_t sk_uid; // user id
struct pid *sk_peer_pid; // 套接字对应的peer的id
......
long sk_rcvtimeo; // 接收超时
ktime_t sk_stamp; // 时间戳
......
struct socket *sk_socket; // Identd协议报告IO信号
void *sk_user_data; // RPC层私有信息
......
struct sock_cgroup_data sk_cgrp_data; // cgroup数据
struct mem_cgroup *sk_memcg; // 内存cgroup关联
void (*sk_state_change)(struct sock *sk); // 状态变化回调函数
void (*sk_data_ready)(struct sock *sk); // 数据处理回调函数
void (*sk_write_space)(struct sock *sk); // 写空间可用回调函数
void (*sk_error_report)(struct sock *sk); // 错误报告回调函数
int (*sk_backlog_rcv)(struct sock *sk, struct sk_buff *skb); // 处理存储区回调函数
......
void (*sk_destruct)(struct sock *sk); // 析构回调函数
struct sock_reuseport __rcu *sk_reuseport_cb; // group容器重用回调函数
......
};
sock_common
\linux-5.11.1\include\net\sock.h
sock_common是套接口在网络层的最小表示,即最基本的网络层套接字信息。
struct sock_common {
/* skc_daddr and skc_rcv_saddr must be grouped on a 8 bytes aligned * address on 64bit arches : cf INET_MATCH() */
union {
__addrpair skc_addrpair;
struct {
__be32 skc_daddr; // 外部/目的IPV4地址
__be32 skc_rcv_saddr; // 本地绑定IPV4地址
};
};
union {
unsigned int skc_hash; // 根据协议查找表获取的哈希值
__u16 skc_u16hashes[2]; // 2个16位哈希值,UDP专用
};
/* skc_dport && skc_num must be grouped as well */
union {
__portpair skc_portpair; //
struct {
__be16 skc_dport; // inet_dport占位符
__u16 skc_num; // inet_num占位符
};
};
unsigned short skc_family; // 网络地址family
volatile unsigned char skc_state; // 连接状态
unsigned char skc_reuse:4; // SO_REUSEADDR 标记位
unsigned char skc_reuseport:1; // SO_REUSEPORT 标记位
unsigned char skc_ipv6only:1; // IPV6标记位
unsigned char skc_net_refcnt:1; // 该套接字网络名字空间内引用数
int skc_bound_dev_if; // 绑定设备索引
union {
struct hlist_node skc_bind_node; // 不同协议查找表组成的绑定哈希表
struct hlist_node skc_portaddr_node; // UDP/UDP-Lite protocol二级哈希表
};
struct proto *skc_prot; // 协议回调函数,根据协议不同而不同
......
union {
struct hlist_node skc_node; // 不同协议查找表组成的主哈希表
struct hlist_nulls_node skc_nulls_node; // UDP/UDP-Lite protocol主哈希表
};
unsigned short skc_tx_queue_mapping; // 该连接的传输队列
unsigned short skc_rx_queue_mapping; // 该连接的接受队列
......
union {
int skc_incoming_cpu; // 多核下处理该套接字数据包的CPU编号
u32 skc_rcv_wnd; // 接收窗口大小
u32 skc_tw_rcv_nxt; /* struct tcp_timewait_sock */
};
refcount_t skc_refcnt; // 套接字引用计数
......
};
sk_buff
struct sk_buff这一结构体在各层协议中都会被用到。该结构体存储了网络数据报的所有信息。包括各层的头部以及 payload,以及必要的各层实现相关的信息。
struct sk_buff {
//sk_buff可以被组织成两种数据结构:双向链表和红黑树。且一个sk_buff不是在双向链表中,就是在红黑树中,因此,采用了 union 来节约空间。
union {
struct {
/* These two members must be first. */
struct sk_buff *next;
struct sk_buff *prev;
//next 和 prev 两个域是用于双向链表的结构体,而 rbnode 是红黑树相关的结构
union {
struct net_device *dev;/* 与该包关联的网络设备 */
/* Some protocols might use this space to store information,
* while device pointer would be NULL.
* UDP receive path is one user.
*/
unsigned long dev_scratch;
};
};
struct rb_node rbnode; /* used in netem, ip4 defrag, and tcp stack */
struct list_head list;
};
union {
struct sock *sk;/* 拥有该 sk_buff 的套接字的指针 */
int ip_defrag_offset;
};
union {
ktime_t tstamp;
u64 skb_mstamp_ns; /* earliest departure time */
};
/*
* This is the control buffer. It is free to use for every
* layer. Please put your private variables there. If you
* want to keep them across layers you have to do a skb_clone()
* first. This is owned by whoever has the skb queued ATM.
*/
char cb[48] __aligned(8);/* 控制用的缓冲区,用于存放各层的私有数据 */
union {
struct {
unsigned long _skb_refdst;/* 存放了目的地项的引用计数 */
void (*destructor)(struct sk_buff *skb);/* 析构函数 */
};
struct list_head tcp_tsorted_anchor;
};
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
unsigned long _nfct;
#endif
unsigned int len,//代表 buffer 中的数据报总长度(含各协议的头部),以及分片长度。
data_len;//代表分片中的数据的长度
__u16 mac_len,//MAC 层头部的长度
hdr_len;//克隆出来的可写的头部的长度
/* Following fields are _not_ copied in __copy_skb_header()
* Note that queue_mapping is here mostly to fill a hole.
*/
__u16 queue_mapping;/* 对于多队列设备的队列关系映射 */
/* if you move cloned around you also must adapt those constants */
#ifdef __BIG_ENDIAN_BITFIELD
#define CLONED_MASK (1 << 7)
#else
#define CLONED_MASK 1
#endif
#define CLONED_OFFSET() offsetof(struct sk_buff, __cloned_offset)
/* private: */
__u8 __cloned_offset[0];
/* public: */
__u8 cloned:1,/* 是否被克隆 */
nohdr:1,/* 只引用了负载 */
fclone:2,/* skbuff 克隆的情况 */
peeked:1,/* peeked 表明该包已经被统计过了,无需再次统计 */
head_frag:1,
pfmemalloc:1;
#ifdef CONFIG_SKB_EXTENSIONS
__u8 active_extensions;
#endif
/* fields enclosed in headers_start/headers_end are copied
* using a single memcpy() in __copy_skb_header()
*/
/* private: */
//一个包的头部,这一部分再次使用了类似上面的方法,用了两个零长度的数组 headers_start和headers_end来标明头部的起始和终止地址
//在 __copy_skb_header() 中,只需使用一个 memcpy() 即可将 headers_start/end 之间的部分克隆一份
__u32 headers_start[0];
/* public: */
/* if you move pkt_type around you also must adapt those constants */
#ifdef __BIG_ENDIAN_BITFIELD
#define PKT_TYPE_MAX (7 << 5)
#else
#define PKT_TYPE_MAX 7
#endif
#define PKT_TYPE_OFFSET() offsetof(struct sk_buff, __pkt_type_offset)
/* private: */
__u8 __pkt_type_offset[0];
/* public: */
__u8 pkt_type:3;/* 该包的类型 */
__u8 ignore_df:1;/* 是否允许本地分片 (local fragmentation) */
__u8 nf_trace:1;/* netfilter 包追踪标记 */
__u8 ip_summed:2;/* 驱动(硬件)给出来的 checksum */
__u8 ooo_okay:1;/* 允许该 socket 到队列的对应关系发生变更 */
__u8 l4_hash:1;/* 表明哈希值字段 hash 是一个典型的 4 元组的通过传输端口的哈希 */
__u8 sw_hash:1;/* 表明哈希值字段 hash 是通过软件栈计算出来的 */
__u8 wifi_acked_valid:1;/* 表明 wifi_acked 是否被设置了 */
__u8 wifi_acked:1;/* 表明帧是否在 wifi 上被确认了 */
__u8 no_fcs:1;/* 请求 NIC 将最后的 4 个字节作为以太网 FCS 来对待 */
/* Indicates the inner headers are valid in the skbuff. */
__u8 encapsulation:1;
__u8 encap_hdr_csum:1;
__u8 csum_valid:1;
#ifdef __BIG_ENDIAN_BITFIELD
#define PKT_VLAN_PRESENT_BIT 7
#else
#define PKT_VLAN_PRESENT_BIT 0
#endif
#define PKT_VLAN_PRESENT_OFFSET() offsetof(struct sk_buff, __pkt_vlan_present_offset)
/* private: */
__u8 __pkt_vlan_present_offset[0];
/* public: */
__u8 vlan_present:1;
__u8 csum_complete_sw:1;
__u8 csum_level:2;
__u8 csum_not_inet:1;
__u8 dst_pending_confirm:1;
#ifdef CONFIG_IPV6_NDISC_NODETYPE
__u8 ndisc_nodetype:2;/* 路由类型(来自链路层) */
#endif
__u8 ipvs_property:1;/* 标明该 skbuff 是否被 ipvs 拥有 */
__u8 inner_protocol_type:1;
__u8 remcsum_offload:1;
#ifdef CONFIG_NET_SWITCHDEV
__u8 offload_fwd_mark:1;
__u8 offload_l3_fwd_mark:1;
#endif
#ifdef CONFIG_NET_CLS_ACT
__u8 tc_skip_classify:1;
__u8 tc_at_ingress:1;
#endif
#ifdef CONFIG_NET_REDIRECT
__u8 redirected:1;
__u8 from_ingress:1;
#endif
#ifdef CONFIG_TLS_DEVICE
__u8 decrypted:1;
#endif
#ifdef CONFIG_NET_SCHED
__u16 tc_index; /* traffic control index */
#endif
union {
__wsum csum;/* 校验码 */
struct {
__u16 csum_start;/* 从 skb->head 开始到应当计算校验码的起始位置的偏移 */
__u16 csum_offset;/* 从 csum_start 开始到存储校验码的位置的偏移 */
};
};
__u32 priority;/* 包队列的优先级 */
int skb_iif;/* 到达的设备的序号 */
__u32 hash;/* 包的哈希值 */
__be16 vlan_proto;/* vlan 包装协议 */
__u16 vlan_tci;/* vlan tag 控制信息 */
#if defined(CONFIG_NET_RX_BUSY_POLL) || defined(CONFIG_XPS)
union {
unsigned int napi_id;/* 表明该 skb 来源的 NAPI 结构体的 id */
unsigned int sender_cpu;
};
#endif
#ifdef CONFIG_NETWORK_SECMARK
__u32 secmark;/* 安全标记 */
#endif
union {
__u32 mark;/* 通用的包的标记位 */
__u32 reserved_tailroom;
};
union {
__be16 inner_protocol;/* 协议(封装好的) */
__u8 inner_ipproto;
};
__u16 inner_transport_header;/* 已封装的内部传输层头部 */
__u16 inner_network_header;/* 已封装的内部网络层头部 */
__u16 inner_mac_header;/* 已封装的内部链路层头部 */
__be16 protocol;/* 驱动(硬件)给出的包的协议类型 */
__u16 transport_header;/* 传输层头部 */
__u16 network_header;/* 网络层头部 */
__u16 mac_header;/* 数据链路层头部 */
#ifdef CONFIG_KCOV
u64 kcov_handle;
#endif
/* private: */
__u32 headers_end[0];
/* public: */
/* These elements must be at the end, see alloc_skb() for details. */
//最后是一组是管理相关的字段。其中,head和end 代表被分配的内存的起始位置和终止位置。而data和tail 则是实际数据的起始和终止位置。
sk_buff_data_t tail;
sk_buff_data_t end;
unsigned char *head,
*data;
unsigned int truesize;//数据报的真实大小。
refcount_t users;//是引用计数,所以是个原子的
#ifdef CONFIG_SKB_EXTENSIONS
/* only useable after checking ->active_extensions != 0 */
struct skb_ext *extensions;
#endif
}