Linux TSO流程分析

Linux TSO流程分析1、TSO(transimitsegmentoffload)是针对tcp而言的,是指协议栈可以将tcp分段的操作offload到硬件的能力,本身需要硬件的支持。当网卡具有TSO能力时,上层协议栈可以直接下发一个超过MTU数据包,而把数据包拆分的动作交给硬件去做,节省cpu资源。除了TSO,内核还有一个GSO,GSO不区分协议类型,GSO默认是开启的,GSO是在软件上实现的一种延迟分段的技术,相比TSO,GSO最终还是需要协议栈自己完成分段的处理。即使网卡没有TSO能力,传输层依然可以封装一个超过M

1、TSO(transimit segment offload)是针对tcp而言的,是指协议栈可以将tcp 分段的操作offload到硬件的能力,本身需要硬件的支持。当网卡具有TSO能力时,上层协议栈可以直接下发一个超过MTU数据包,而把数据包拆分的动作交给硬件去做,节省cpu资源。除了TSO,内核还有一个GSO,GSO不区分协议类型,GSO默认是开启的,GSO是在软件上实现的一种延迟分段的技术,相比TSO,GSO最终还是需要协议栈自己完成分段的处理。

即使网卡没有TSO能力,传输层依然可以封装一个超过MTU的数据包,等数据包发送给驱动之前,检查网卡是否有TSO能力,如果没有,再调用ip层和传输层的分段处理函数完成数据包的分段处理,通过这样,内核将数据包的分段延迟到了dev链路层,提升数据包处理效率。当支持GSO/TSO时,skb的数据存放格式如下所示,在skb->end后,存在一个skb_share区域,skb的非线性区数据就存放在这里,GSO/TSO分段的处理就是要把skb数据(包括线性区、非线性区)按gso_size的大小进行分割处理;本文以虚拟网卡为例,介绍TSO的整体流程。

Linux TSO流程分析

2、驱动初始化过程

virtio驱动加载时,会根据qemu/vhost前后端feature协商的结果判断虚拟网卡是否有TSO能力,如果有,则在dev->hw_feature或上NETIF_F_TSO标志,然后赋给dev->features。

static int virtnet_probe(struct virtio_device *vdev)
{
		/* Individual feature bits: what can host handle? */
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO4))
			dev->hw_features |= NETIF_F_TSO;

                //gso默认为True
		if (gso)
			dev->features |= dev->hw_features & (NETIF_F_ALL_TSO|NETIF_F_UFO);
}

3、注册虚拟网卡设备时,设置GSO能力。

virtnet_probe  —> register_netdev  —->register_netdevice

int register_netdevice(struct net_device *dev)
{
	dev->hw_features |= NETIF_F_SOFT_FEATURES;
        //dev->features是给协议栈用的
	dev->features |= NETIF_F_SOFT_FEATURES;
}

4、在发送端发起connect连接或三次握手建立完成(tcp_v4_syn_recv_sock),会开启GSO。

int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
        //设置GSO类型为tcpv4
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
}

tcp_v4_connect将sock的gso_type设置为tcpv4类型,然后调用sk_setup_caps,根据net_gso_ok返回值,判断是否支持GSO能力,正常这里是返回True。

static inline bool net_gso_ok(netdev_features_t features, int gso_type)
{
        //这个函数可以认为是检验是否具有tso能力,主要的调用地方有两个:
        //1、tcp层connect或三次握手完成时调用,这个调用流程里,如果开启GSO,则features同时会置上TSO,
        //   而GSO默认都是开启的,因此tcp层调用这个接口,会返回true
        //2、dev层将skb发送给驱动前调用,判断是否需要做TSO,这个调用流程里,features直接等于dev->features,
        //   如果网卡没有TSO能力,则features不会有TSO的标志,那这个函数就会返回false
    
	netdev_features_t feature = gso_type & SKB_GSO1_MASK;

	feature <<= NETIF_F_GSO_SHIFT;

	if (gso_type & SKB_GSO2_MASK) {
		netdev_features_t f = gso_type & SKB_GSO2_MASK;
		f <<= NETIF_F_GSO2_SHIFT;
		feature |= f;
	}

	/* check flags correspondence */
	BUILD_BUG_ON(SKB_GSO_TCPV4   != (NETIF_F_TSO >> NETIF_F_GSO_SHIFT));
	BUILD_BUG_ON(SKB_GSO_UDP     != (NETIF_F_UFO >> NETIF_F_GSO_SHIFT));
	BUILD_BUG_ON(SKB_GSO_DODGY   != (NETIF_F_GSO_ROBUST >> NETIF_F_GSO_SHIFT));
	BUILD_BUG_ON(SKB_GSO_TCP_ECN != (NETIF_F_TSO_ECN >> NETIF_F_GSO_SHIFT));
	BUILD_BUG_ON(SKB_GSO_TCPV6   != (NETIF_F_TSO6 >> NETIF_F_GSO_SHIFT));
	BUILD_BUG_ON(SKB_GSO_FCOE    != (NETIF_F_FSO >> NETIF_F_GSO_SHIFT));
	BUILD_BUG_ON(SKB_GSO_GRE     != (NETIF_F_GSO_GRE >> NETIF_F_GSO_SHIFT));
	BUILD_BUG_ON(SKB_GSO_IPIP    != (NETIF_F_GSO_IPIP >> NETIF_F_GSO_SHIFT));
	BUILD_BUG_ON(SKB_GSO_SIT     != (NETIF_F_GSO_SIT >> NETIF_F_GSO_SHIFT));
	BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL != (NETIF_F_GSO_UDP_TUNNEL >> NETIF_F_GSO_SHIFT));
	BUILD_BUG_ON(SKB_GSO_MPLS    != (NETIF_F_GSO_MPLS >> NETIF_F_GSO_SHIFT));

	/* GSO2 flags, see netdev_features.h */
	BUILD_BUG_ON(SKB_GSO_GRE_CSUM != (NETIF_F_GSO_GRE_CSUM >> NETIF_F_GSO2_SHIFT));
	BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL_CSUM != (NETIF_F_GSO_UDP_TUNNEL_CSUM >> NETIF_F_GSO2_SHIFT));
	BUILD_BUG_ON(SKB_GSO_PARTIAL != (NETIF_F_GSO_PARTIAL >> NETIF_F_GSO2_SHIFT));
	BUILD_BUG_ON(SKB_GSO_SCTP    != (NETIF_F_GSO_SCTP >> NETIF_F_GSO2_SHIFT));
	BUILD_BUG_ON(SKB_GSO_TCP_FIXEDID != (NETIF_F_TSO_MANGLEID >> NETIF_F_GSO2_SHIFT));

	return (features & feature) == feature;
}

协议层校验支持gso后,会同时开启分散、聚合及csum校验能力。

void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
	sk_dst_set(sk, dst);
	sk->sk_route_caps = dst->dev->features;
	if (sk->sk_route_caps & NETIF_F_GSO)
		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
	sk->sk_route_caps &= ~sk->sk_route_nocaps;
	if (sk_can_gso(sk)) {
		//skb头部需要额外空间,关闭GSO
		if (dst->header_len) {
			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
		} else {
			//开启skb的分散、聚合及csum功能,因为网卡做TSO的同时需要支持分散、聚合功能以及csum的重新计算能力
			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
			sk->sk_gso_max_size = dst->dev->gso_max_size;
			sk->sk_gso_max_segs = dst->dev->gso_max_segs;
		}
	}
}

5、应用程序调用send发送数据包,send系统调用最终调用tcp_sendmsg,在tcp_sendmsg里判断是否支持GSO,支持的话将用户数据信息封装到skb的线性区或非线性区,封装完后的skb数据包就是一个大包了,然后调用tcp_push_one发送给IP层,当然发送之前还会调用check函数,根据csum的类型计算tcp层的csum,支持GSO、TSO情况下,tcp层只会计算伪头部的csum。

int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
		size_t size)
{
	struct iovec *iov;
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	int iovlen, flags, err, copied = 0;
	int mss_now = 0, size_goal, copied_syn = 0, offset = 0;
	bool sg;
	long timeo;

	lock_sock(sk);

	flags = msg->msg_flags;
	if (flags & MSG_FASTOPEN) {
		err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
		if (err == -EINPROGRESS && copied_syn > 0)
			goto out;
		else if (err)
			goto out_err;
		offset = copied_syn;
	}

	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

	/* Wait for a connection to finish. One exception is TCP Fast Open
	 * (passive side) where data is allowed to be sent before a connection
	 * is fully established.
	 */
	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
	    !tcp_passive_fastopen(sk)) {
		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
			goto do_error;
	}

	if (unlikely(tp->repair)) {
		if (tp->repair_queue == TCP_RECV_QUEUE) {
			copied = tcp_send_rcvq(sk, msg, size);
			goto out_nopush;
		}

		err = -EINVAL;
		if (tp->repair_queue == TCP_NO_QUEUE)
			goto out_err;

		/* 'common' sending to sendq */
	}

	/* This should be in poll */
	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

	//获取mss,如果支持GSO,这里获取的是协商到的mss的整数倍
	mss_now = tcp_send_mss(sk, &size_goal, flags);

	/* Ok commence sending. */
	iovlen = msg->msg_iovlen;
	iov = msg->msg_iov;
	copied = 0;

	err = -EPIPE;
	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
		goto out_err;

        //判断是否有分散、聚合能力
	sg = !!(sk->sk_route_caps & NETIF_F_SG);

	while (--iovlen >= 0) {
		size_t seglen = iov->iov_len;
		unsigned char __user *from = iov->iov_base;

		iov++;
		if (unlikely(offset > 0)) {  /* Skip bytes copied in SYN */
			if (offset >= seglen) {
				offset -= seglen;
				continue;
			}
			seglen -= offset;
			from += offset;
			offset = 0;
		}

		while (seglen > 0) {
			int copy = 0;
			int max = size_goal;

			//获取write队列的最后一个skb
			skb = tcp_write_queue_tail(sk);
			if (tcp_send_head(sk)) {
				if (skb->ip_summed == CHECKSUM_NONE)
					max = mss_now;
				//copy表示skb可以存放的最大的数据长度
				copy = max - skb->len;
			}

			//如果skb->len >= max,说明这个skb已经填满数据了;需要重新分配一个skb
			if (copy <= 0) {
new_segment:
				/* Allocate new segment. If the interface is SG,
				 * allocate skb fitting to single page.
				 */
				if (!sk_stream_memory_free(sk))
					goto wait_for_sndbuf;

				skb = sk_stream_alloc_skb(sk,
							  //获取skb头的长度
							  select_size(sk, sg),
							  sk->sk_allocation);
				if (!skb)
					goto wait_for_memory;

				/*
				 * Check whether we can use HW checksum.
				 */
				//在sk_setup_caps,已经置上的csum能力,设置ip_summed模式为CHECKSUM_PARTIAL
				//意思是协议栈只做ip头和伪头部的checksum,palyload需要硬件帮忙做
				if (sk->sk_route_caps & NETIF_F_CSUM_MASK)
					skb->ip_summed = CHECKSUM_PARTIAL;

				//分完一个skb后,将其加入sk->sk_write_queue队列中
				skb_entail(sk, skb);
				copy = size_goal;
				max = size_goal;

				/* All packets are restored as if they have
				 * already been sent. skb_mstamp isn't set to
				 * avoid wrong rtt estimation.
				 */
				if (tp->repair)
					TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED;
			}

			/* Try to append data to the end of skb. */
			//拷贝的数据最多不超过用户发送的消息长度
			if (copy > seglen)
				copy = seglen;

			/* Where to copy to? */
			//判断线性区是否还有空间
			if (skb_availroom(skb) > 0) {
				/* We have some space in skb head. Superb! */
				copy = min_t(int, copy, skb_availroom(skb));
				//将用户数据拷贝到线性区,并同步更新skb->len
				err = skb_add_data_nocache(sk, skb, from, copy);
				if (err)
					goto do_fault;
			} else {
				//线性区已经没有空间,将报文信息放到skinfo里
				bool merge = true;
				int i = skb_shinfo(skb)->nr_frags;
				struct page_frag *pfrag = sk_page_frag(sk);

				//sk_page_frag表示当前的skb_shinfo的最后一个frags,这里判断最后一个frags的page是否还有
				//空间可以存放数据(最小是32字节),如果没有,则重新分配一个page并放到pfrag->page里
				if (!sk_page_frag_refill(sk, pfrag))
					goto wait_for_memory;

				//判断pfrag->page是否是sk_page_frag指向的最后一个page,如果是,则表明上一步判断
				//sk_page_frag里的page还有足够空间可以存放数据;
				//如果不是,则表明上一步有重新分配过page页,把merge置为false,接下去需要把这个新分配的page页
				//添加到skb_shinfo里.
				if (!skb_can_coalesce(skb, i, pfrag->page,
						      pfrag->offset)) {
					if (i == MAX_SKB_FRAGS || !sg) {
						tcp_mark_push(tp, skb);
						goto new_segment;
					}
					merge = false;
				}

				//取需要拷贝的数据包长度与page剩余空间的最小值
				copy = min_t(int, copy, pfrag->size - pfrag->offset);

				if (!sk_wmem_schedule(sk, copy))
					goto wait_for_memory;

				//拷贝用户数据到pfrag->page里,并同步更新skb->len和skb->data_len,所以skb->data_len只表示非线性区的长度
				err = skb_copy_to_page_nocache(sk, from, skb,
							       pfrag->page,
							       pfrag->offset,
							       copy);
				if (err)
					goto do_error;

				/* Update the skb. */
				//如果是合并操作,则修改最后一个page的大小信息
				if (merge) {
					skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
				} else {
					//新分配的page页,添加到skb_shinfo(skb)->frags[i]里
					//同时将skb_shinfo(skb)->nr_frags值增1
					skb_fill_page_desc(skb, i, pfrag->page,
							   pfrag->offset, copy);
					//page引用计数加1
					get_page(pfrag->page);
				}
				pfrag->offset += copy;
			}

			if (!copied)
				TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;

			tp->write_seq += copy;
			TCP_SKB_CB(skb)->end_seq += copy;
			skb_shinfo(skb)->gso_segs = 0;

			from += copy;
			copied += copy;
			if ((seglen -= copy) == 0 && iovlen == 0)
				goto out;

			if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))
				continue;

			if (forced_push(tp)) {
				tcp_mark_push(tp, skb);
				__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
			} else if (skb == tcp_send_head(sk))
				tcp_push_one(sk, mss_now);
			continue;

wait_for_sndbuf:
			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
			if (copied)
				tcp_push(sk, flags & ~MSG_MORE, mss_now,
					 TCP_NAGLE_PUSH, size_goal);

			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
				goto do_error;

			mss_now = tcp_send_mss(sk, &size_goal, flags);
		}
	}

out:
	if (copied)
		tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
out_nopush:
	release_sock(sk);
	return copied + copied_syn;

do_fault:
	if (!skb->len) {
		tcp_unlink_write_queue(skb, sk);
		/* It is the one place in all of TCP, except connection
		 * reset, where we can be unlinking the send_head.
		 */
		tcp_check_send_head(sk, skb);
		sk_wmem_free_skb(sk, skb);
	}

do_error:
	if (copied + copied_syn)
		goto out;
out_err:
	err = sk_stream_error(sk, flags, err);
	release_sock(sk);
	return err;
}

6、在tcp_write_xmit流程里,通过tcp_init_tso_segs设置gso_size及分段个数gso_segs,其中gso_size为mss值,这两个参数用于告诉硬件做tso拆分时,需要拆分成的数据包个数及长度。

static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
				 unsigned int mss_now)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);

	/* Make sure we own this skb before messing gso_size/gso_segs */
	WARN_ON_ONCE(skb_cloned(skb));

	if (skb->len <= mss_now || !sk_can_gso(sk) ||
	    skb->ip_summed == CHECKSUM_NONE) {
		/* Avoid the costly divide in the normal
		 * non-TSO case.
		 */
		shinfo->gso_segs = 1;
		shinfo->gso_size = 0;
		shinfo->gso_type = 0;
	} else {
		//gso_segs为数据包总长度除mss
		shinfo->gso_segs = DIV_ROUND_UP(skb->len, mss_now);
		//gso_size为mss值,硬件做拆分时,会按gso_size的长度拆分每个数据包
		shinfo->gso_size = mss_now;
		shinfo->gso_type = sk->sk_gso_type;
	}
}

7、dev层发送给驱动之前,进一步校验网卡是否具有TSO能力,如果没有,则回调tcp的分段函数完成skb的分段处理,如果支持,则直接发送给驱动;

static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
{
	netdev_features_t features;

	if (skb->next)
		return skb;

	features = netif_skb_features(skb);
	skb = validate_xmit_vlan(skb, features);
	if (unlikely(!skb))
		goto out_null;

	//这里的featrues就是dev->features,如果网卡不具有TSO,则dev->features不会有TSO的标志,这个函数返回true
	if (netif_needs_gso(skb, features)) {
		struct sk_buff *segs;

		segs = skb_gso_segment(skb, features);
		if (IS_ERR(segs)) {
			goto out_kfree_skb;
		} else if (segs) {
			consume_skb(skb);
			skb = segs;
		}
	} else {
		if (skb_needs_linearize(skb, features) &&
		    __skb_linearize(skb))
			goto out_kfree_skb;

		/* If packet is not checksummed and device does not
		 * support checksumming for this protocol, complete
		 * checksumming here.
		 */
		if (skb->ip_summed == CHECKSUM_PARTIAL) {
			if (skb->encapsulation)
				skb_set_inner_transport_header(skb,
							       skb_checksum_start_offset(skb));
			else
				skb_set_transport_header(skb,
							 skb_checksum_start_offset(skb));
			if (skb_csum_hwoffload_help(skb, features))
				goto out_kfree_skb;
		}
	}

	return skb;

out_kfree_skb:
	kfree_skb(skb);
out_null:
	return NULL;
}

8、如果需要做gso分段,则先进入ip层的分段处理,在ip层分段处理函数里,主要工作是调用tcp层的分段处理函数,等tcp层分段完成后,重新对分段的skb的ip头做checksum;

static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
					netdev_features_t features)
{
	bool udpfrag = false, fixedid = false, gso_partial, encap;
	struct sk_buff *segs = ERR_PTR(-EINVAL);
	const struct net_offload *ops;
	unsigned int offset = 0;
	struct iphdr *iph;
	int proto, tot_len;
	int nhoff;
	int ihl;
	int id;

	//设置ip头基于head的偏移
	skb_reset_network_header(skb);
	//ip头基于mac头的偏移,即使就是mac头的长度
	nhoff = skb_network_header(skb) - skb_mac_header(skb);
	if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
		goto out;

	iph = ip_hdr(skb);
	ihl = iph->ihl * 4;
	if (ihl < sizeof(*iph))
		goto out;

	id = ntohs(iph->id);
	proto = iph->protocol;

	/* Warning: after this point, iph might be no longer valid */
	if (unlikely(!pskb_may_pull(skb, ihl)))
		goto out;
	//剥离ip头
	__skb_pull(skb, ihl);

	encap = SKB_GSO_CB(skb)->encap_level > 0;
	if (encap)
		features &= skb->dev->hw_enc_features;
	SKB_GSO_CB(skb)->encap_level += ihl;

	//设置tcp头基于head的偏移
	skb_reset_transport_header(skb);

	segs = ERR_PTR(-EPROTONOSUPPORT);

	if (!skb->encapsulation || encap) {
		udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
		fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID);

		/* fixed ID is invalid if DF bit is not set */
		if (fixedid && !(ip_hdr(skb)->frag_off & htons(IP_DF)))
			goto out;
	}

	ops = rcu_dereference(inet_offloads[proto]);
	if (likely(ops && ops->callbacks.gso_segment))
		segs = ops->callbacks.gso_segment(skb, features);

	if (IS_ERR_OR_NULL(segs))
		goto out;

	gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);

	skb = segs;
	do {
		//重新为每个分段skb设置ip头信息
		iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
		if (udpfrag) {
			iph->frag_off = htons(offset >> 3);
			if (skb->next != NULL)
				iph->frag_off |= htons(IP_MF);
			offset += skb->len - nhoff - ihl;
			tot_len = skb->len - nhoff;
		} else if (skb_is_gso(skb)) {
			if (!fixedid) {
				iph->id = htons(id);
				id += skb_shinfo(skb)->gso_segs;
			}

			if (gso_partial)
				tot_len = skb_shinfo(skb)->gso_size +
					  SKB_GSO_CB(skb)->data_offset +
					  skb->head - (unsigned char *)iph;
			else
				tot_len = skb->len - nhoff;
		} else {
			if (!fixedid)
				iph->id = htons(id++);
			tot_len = skb->len - nhoff;
		}
		iph->tot_len = htons(tot_len);
		//为每个分段skb的ip头做checksum
		ip_send_check(iph);
		if (encap)
			skb_reset_inner_headers(skb);
		skb->network_header = (u8 *)iph - skb->head;
	} while ((skb = skb->next));

out:
	return segs;
}

9、进入tcp层分段处理函数后,会调用tcp_gso_segment完成skb分段,分段完成后,重新为每个分段skb做tcp层的checksum,以及为每个分段skb重新分配seq序列号等;

struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
				netdev_features_t features)
{
	struct sk_buff *segs = ERR_PTR(-EINVAL);
	unsigned int sum_truesize = 0;
	struct tcphdr *th;
	unsigned int thlen;
	unsigned int seq;
	__be32 delta;
	unsigned int oldlen;
	unsigned int mss;
	struct sk_buff *gso_skb = skb;
	__sum16 newcheck;
	bool ooo_okay, copy_destructor;

	th = tcp_hdr(skb);
	thlen = th->doff * 4;
	if (thlen < sizeof(*th))
		goto out;

	if (!pskb_may_pull(skb, thlen))
		goto out;

	oldlen = (u16)~skb->len;
	__skb_pull(skb, thlen);

	mss = skb_shinfo(skb)->gso_size;
	if (unlikely(skb->len <= mss))
		goto out;

	if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
		/* Packet is from an untrusted source, reset gso_segs. */

		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);

		segs = NULL;
		goto out;
	}

	copy_destructor = gso_skb->destructor == tcp_wfree;
	ooo_okay = gso_skb->ooo_okay;
	/* All segments but the first should have ooo_okay cleared */
	skb->ooo_okay = 0;

        //真正做分段的处理函数
	segs = skb_segment(skb, features);
	if (IS_ERR(segs))
		goto out;

	/* Only first segment might have ooo_okay set */
	segs->ooo_okay = ooo_okay;

	/* GSO partial and frag_list segmentation only requires splitting
	 * the frame into an MSS multiple and possibly a remainder, both
	 * cases return a GSO skb. So update the mss now.
	 */
	if (skb_is_gso(segs))
		mss *= skb_shinfo(segs)->gso_segs;

	delta = htonl(oldlen + (thlen + mss));

	skb = segs;
	th = tcp_hdr(skb);
	seq = ntohl(th->seq);

	newcheck = ~csum_fold((__force __wsum)((__force u32)th->check +
					       (__force u32)delta));

	while (skb->next) {
		th->fin = th->psh = 0;
		th->check = newcheck;

		//为每个分段skb做tcp层checksum
		if (skb->ip_summed == CHECKSUM_PARTIAL)
			gso_reset_checksum(skb, ~th->check);
		else
			th->check = gso_make_checksum(skb, ~th->check);

		//设置skb的序列号,拆分后,除了最后一个skb,其余大小均为mss
		seq += mss;
		if (copy_destructor) {
			skb->destructor = gso_skb->destructor;
			skb->sk = gso_skb->sk;
			sum_truesize += skb->truesize;
		}
		skb = skb->next;
		th = tcp_hdr(skb);

		th->seq = htonl(seq);
		th->cwr = 0;
	}

	/* Following permits TCP Small Queues to work well with GSO :
	 * The callback to TCP stack will be called at the time last frag
	 * is freed at TX completion, and not right now when gso_skb
	 * is freed by GSO engine
	 */
	if (copy_destructor) {
		swap(gso_skb->sk, skb->sk);
		swap(gso_skb->destructor, skb->destructor);
		sum_truesize += skb->truesize;
		atomic_add(sum_truesize - gso_skb->truesize,
			   &skb->sk->sk_wmem_alloc);
	}

	delta = htonl(oldlen + (skb_tail_pointer(skb) -
				skb_transport_header(skb)) +
		      skb->data_len);
	th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
				(__force u32)delta));
	if (skb->ip_summed == CHECKSUM_PARTIAL)
		gso_reset_checksum(skb, ~th->check);
	else
		th->check = gso_make_checksum(skb, ~th->check);
out:
	return segs;
}

可以看到tcp_gso_segment里真正去做skb分段处理的是在skb_segment,skb_segment里将tso的skb按mss长度进行分段处理,对线性区域的数据,直接拷贝到分段skb的线性区域,对于非线性区域数据,直接将frags指针指向分段skb的frags;

struct sk_buff *skb_segment(struct sk_buff *head_skb,
			    netdev_features_t features)
{
	struct sk_buff *segs = NULL;
	struct sk_buff *tail = NULL;
	//flag_list存放ip分片数包,在ip_do_fragment里会设置
	struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list;
	//frags存放分散、聚合的非线性区数据包
	skb_frag_t *frag = skb_shinfo(head_skb)->frags;
	unsigned int mss = skb_shinfo(head_skb)->gso_size;
	//doffset为ip头+mac头的大小之和
	unsigned int doffset = head_skb->data - skb_mac_header(head_skb);
	struct sk_buff *frag_skb = head_skb;
	unsigned int offset = doffset;
	unsigned int tnl_hlen = skb_tnl_header_len(head_skb);
	unsigned int partial_segs = 0;
	unsigned int headroom;
	unsigned int len = head_skb->len;
	__be16 proto;
	bool csum, sg;
	int nfrags = skb_shinfo(head_skb)->nr_frags;
	int err = -ENOMEM;
	int i = 0;
	int pos;
	int dummy;

	//为首个skb分配ip、mac头空间
	__skb_push(head_skb, doffset);
	proto = skb_network_protocol(head_skb, &dummy);
	if (unlikely(!proto))
		return ERR_PTR(-EINVAL);

	sg = !!(features & NETIF_F_SG);
	csum = !!can_checksum_protocol(features, proto);

	if (sg && csum && (mss != GSO_BY_FRAGS))  {
		if (!(features & NETIF_F_GSO_PARTIAL)) {
			struct sk_buff *iter;

			if (!list_skb ||
			    !net_gso_ok(features, skb_shinfo(head_skb)->gso_type))
				goto normal;

			/* Split the buffer at the frag_list pointer.
			 * This is based on the assumption that all
			 * buffers in the chain excluding the last
			 * containing the same amount of data.
			 */
			skb_walk_frags(head_skb, iter) {
				if (skb_headlen(iter))
					goto normal;

				len -= iter->len;
			}
		}

		/* GSO partial only requires that we trim off any excess that
		 * doesn't fit into an MSS sized block, so take care of that
		 * now.
		 */
		partial_segs = len / mss;
		if (partial_segs > 1)
			mss *= partial_segs;
		else
			partial_segs = 0;
	}

normal:
	headroom = skb_headroom(head_skb);
	//获取线性区的长度skb->len - skb->data_len
	pos = skb_headlen(head_skb);

	do {
		struct sk_buff *nskb;
		skb_frag_t *nskb_frag;
		int hsize;
		int size;

		if (unlikely(mss == GSO_BY_FRAGS)) {
			len = list_skb->len;
		} else {
			//没新增一个分段skb,offset累加分段skb的长度
			//len为下一个新增分段skb的长度, 最多不超过mss值
			len = head_skb->len - offset;
			if (len > mss)
				len = mss;
		}

		//skb_headlen = skb->len - skb->data_len表示skb的线性区域长度,当第一次分段时,
		//理论上线性区域就拷贝完成了,因此第二次的时候hsize应该就小于0了;
		//当hsize小于0时,直接将其赋值为0,接下来新分配的分段skb就不会再申请skb->data
		//线性空间了,而是直接拷贝非线性区的数据
		hsize = skb_headlen(head_skb) - offset;
		if (hsize < 0)
			hsize = 0;
		if (hsize > len || !sg)
			hsize = len;

		//拷贝ip分片的数据包
		if (!hsize && i >= nfrags && skb_headlen(list_skb) &&
		    (skb_headlen(list_skb) == len || sg)) {
			BUG_ON(skb_headlen(list_skb) > len);

			i = 0;
			nfrags = skb_shinfo(list_skb)->nr_frags;
			frag = skb_shinfo(list_skb)->frags;
			frag_skb = list_skb;
			pos += skb_headlen(list_skb);

			while (pos < offset + len) {
				BUG_ON(i >= nfrags);

				size = skb_frag_size(frag);
				if (pos + size > offset + len)
					break;

				i++;
				pos += size;
				frag++;
			}

			nskb = skb_clone(list_skb, GFP_ATOMIC);
			list_skb = list_skb->next;

			if (unlikely(!nskb))
				goto err;

			if (unlikely(pskb_trim(nskb, len))) {
				kfree_skb(nskb);
				goto err;
			}

			hsize = skb_end_offset(nskb);
			if (skb_cow_head(nskb, doffset + headroom)) {
				kfree_skb(nskb);
				goto err;
			}

			nskb->truesize += skb_end_offset(nskb) - hsize;
			skb_release_head_state(nskb);
			__skb_push(nskb, doffset);
		} else {
			//拷贝skb的线性区和非线性区
			nskb = __alloc_skb(hsize + doffset + headroom,
					   GFP_ATOMIC, skb_alloc_rx_flag(head_skb),
					   NUMA_NO_NODE);

			if (unlikely(!nskb))
				goto err;

			skb_reserve(nskb, headroom);
			__skb_put(nskb, doffset);
		}

		//segs为空时,nskb作为首个skb赋给segs,否则将新的nskb挂到next里
		if (segs)
			tail->next = nskb;
		else
			segs = nskb;
		tail = nskb;

		//从首个skb里拷贝skb头信息
		__copy_skb_header(nskb, head_skb);

		skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom);
		//设置mac头长度
		skb_reset_mac_len(nskb);

		skb_copy_from_linear_data_offset(head_skb, -tnl_hlen,
						 nskb->data - tnl_hlen,
						 doffset + tnl_hlen);

		if (nskb->len == len + doffset)
			goto perform_csum_check;

		if (!sg) {
			if (!nskb->remcsum_offload)
				nskb->ip_summed = CHECKSUM_NONE;
			SKB_GSO_CB(nskb)->csum =
				skb_copy_and_csum_bits(head_skb, offset,
						       skb_put(nskb, len),
						       len, 0);
			SKB_GSO_CB(nskb)->csum_start =
				skb_headroom(nskb) + doffset;
			continue;
		}

		nskb_frag = skb_shinfo(nskb)->frags;

		//将skb的线性区域拷贝到nskb上
		//hsize=0时,没有线性区域数据需要拷贝
		skb_copy_from_linear_data_offset(head_skb, offset,
						 skb_put(nskb, hsize), hsize);

		skb_shinfo(nskb)->tx_flags = skb_shinfo(head_skb)->tx_flags &
			SKBTX_SHARED_FRAG;

		//pos的初始值为线性区长度,offset+len表示本地分段skb做完后,需要拷贝的总的数据长度,
		//当pos<offset+len时,表示线性区已经拷贝完了,但每次分段skb数据还没拷贝完成,
		//接来下就只能从非线性区拷贝了,每拷贝完一个frags,pos长度也相应增加对应
		//的frags长度,直到本次分段skb数据拷贝完成
		while (pos < offset + len) {
			//非线性区frags也拷贝完成,则从ip分片区拷贝
			if (i >= nfrags) {
				BUG_ON(skb_headlen(list_skb));

				i = 0;
				nfrags = skb_shinfo(list_skb)->nr_frags;
				frag = skb_shinfo(list_skb)->frags;
				frag_skb = list_skb;

				BUG_ON(!nfrags);

				list_skb = list_skb->next;
			}

			if (unlikely(skb_shinfo(nskb)->nr_frags >=
				     MAX_SKB_FRAGS)) {
				net_warn_ratelimited(
					"skb_segment: too many frags: %u %u\n",
					pos, mss);
				goto err;
			}

			if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC)))
				goto err;

			//拷贝非线性区时,并非是把原来的非线性区数据拷贝到新的分段skb的线性区
			//而是直接将分段skb的frags指针指向原来skb的frags
			*nskb_frag = *frag;
			__skb_frag_ref(nskb_frag);
			size = skb_frag_size(nskb_frag);

			if (pos < offset) {
				nskb_frag->page_offset += offset - pos;
				skb_frag_size_sub(nskb_frag, offset - pos);
			}

			skb_shinfo(nskb)->nr_frags++;
			//拷贝完成一个frags,修改pos长度
			if (pos + size <= offset + len) {
				i++;
				frag++;
				pos += size;
			} else {
				skb_frag_size_sub(nskb_frag, pos + size - (offset + len));
				goto skip_fraglist;
			}

			nskb_frag++;
		}

skip_fraglist:
		nskb->data_len = len - hsize;
		nskb->len += nskb->data_len;
		nskb->truesize += nskb->data_len;

perform_csum_check:
		if (!csum) {
			if (skb_has_shared_frag(nskb)) {
				err = __skb_linearize(nskb);
				if (err)
					goto err;
			}
			if (!nskb->remcsum_offload)
				nskb->ip_summed = CHECKSUM_NONE;
			SKB_GSO_CB(nskb)->csum =
				skb_checksum(nskb, doffset,
					     nskb->len - doffset, 0);
			SKB_GSO_CB(nskb)->csum_start =
				skb_headroom(nskb) + doffset;
		}
	} while ((offset += len) < head_skb->len); //拷贝的数据长度还没到整个skb的长度,进入下一次分段

	/* Some callers want to get the end of the list.
	 * Put it in segs->prev to avoid walking the list.
	 * (see validate_xmit_skb_list() for example)
	 */
	segs->prev = tail;

	if (partial_segs) {
		struct sk_buff *iter;
		int type = skb_shinfo(head_skb)->gso_type;
		unsigned short gso_size = skb_shinfo(head_skb)->gso_size;

		/* Update type to add partial and then remove dodgy if set */
		type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL;
		type &= ~SKB_GSO_DODGY;

		/* Update GSO info and prepare to start updating headers on
		 * our way back down the stack of protocols.
		 */
		for (iter = segs; iter; iter = iter->next) {
			skb_shinfo(iter)->gso_size = gso_size;
			skb_shinfo(iter)->gso_segs = partial_segs;
			skb_shinfo(iter)->gso_type = type;
			SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset;
		}

		if (tail->len - doffset <= gso_size)
			skb_shinfo(tail)->gso_size = 0;
		else if (tail != segs)
			skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size);
	}

	/* Following permits correct backpressure, for protocols
	 * using skb_set_owner_w().
	 * Idea is to tranfert ownership from head_skb to last segment.
	 */
	if (head_skb->destructor == sock_wfree) {
		swap(tail->truesize, head_skb->truesize);
		swap(tail->destructor, head_skb->destructor);
		swap(tail->sk, head_skb->sk);
	}
	return segs;

err:
	kfree_skb_list(segs);
	return ERR_PTR(err);
}

10、分段处理完成后,返回分段的skb链表,然后将分段好的skb链表进一步发送(dev(qdisc) —>驱动 —–>网卡)。

今天的文章Linux TSO流程分析分享到此就结束了,感谢您的阅读。

版权声明:本文内容由互联网用户自发贡献,该文观点仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 举报,一经查实,本站将立刻删除。
如需转载请保留出处:https://bianchenghao.cn/7209.html

(0)
编程小号编程小号

相关推荐

发表回复

您的电子邮箱地址不会被公开。 必填项已用*标注