1、TSO(transimit segment offload)是针对tcp而言的,是指协议栈可以将tcp 分段的操作offload到硬件的能力,本身需要硬件的支持。当网卡具有TSO能力时,上层协议栈可以直接下发一个超过MTU数据包,而把数据包拆分的动作交给硬件去做,节省cpu资源。除了TSO,内核还有一个GSO,GSO不区分协议类型,GSO默认是开启的,GSO是在软件上实现的一种延迟分段的技术,相比TSO,GSO最终还是需要协议栈自己完成分段的处理。
即使网卡没有TSO能力,传输层依然可以封装一个超过MTU的数据包,等数据包发送给驱动之前,检查网卡是否有TSO能力,如果没有,再调用ip层和传输层的分段处理函数完成数据包的分段处理,通过这样,内核将数据包的分段延迟到了dev链路层,提升数据包处理效率。当支持GSO/TSO时,skb的数据存放格式如下所示,在skb->end后,存在一个skb_share区域,skb的非线性区数据就存放在这里,GSO/TSO分段的处理就是要把skb数据(包括线性区、非线性区)按gso_size的大小进行分割处理;本文以虚拟网卡为例,介绍TSO的整体流程。
2、驱动初始化过程
virtio驱动加载时,会根据qemu/vhost前后端feature协商的结果判断虚拟网卡是否有TSO能力,如果有,则在dev->hw_feature或上NETIF_F_TSO标志,然后赋给dev->features。
static int virtnet_probe(struct virtio_device *vdev)
{
/* Individual feature bits: what can host handle? */
if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO4))
dev->hw_features |= NETIF_F_TSO;
//gso默认为True
if (gso)
dev->features |= dev->hw_features & (NETIF_F_ALL_TSO|NETIF_F_UFO);
}
3、注册虚拟网卡设备时,设置GSO能力。
virtnet_probe —> register_netdev —->register_netdevice
int register_netdevice(struct net_device *dev)
{
dev->hw_features |= NETIF_F_SOFT_FEATURES;
//dev->features是给协议栈用的
dev->features |= NETIF_F_SOFT_FEATURES;
}
4、在发送端发起connect连接或三次握手建立完成(tcp_v4_syn_recv_sock),会开启GSO。
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
inet->inet_sport, inet->inet_dport, sk);
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
rt = NULL;
goto failure;
}
/* OK, now commit destination to socket. */
//设置GSO类型为tcpv4
sk->sk_gso_type = SKB_GSO_TCPV4;
sk_setup_caps(sk, &rt->dst);
}
tcp_v4_connect将sock的gso_type设置为tcpv4类型,然后调用sk_setup_caps,根据net_gso_ok返回值,判断是否支持GSO能力,正常这里是返回True。
static inline bool net_gso_ok(netdev_features_t features, int gso_type)
{
//这个函数可以认为是检验是否具有tso能力,主要的调用地方有两个:
//1、tcp层connect或三次握手完成时调用,这个调用流程里,如果开启GSO,则features同时会置上TSO,
// 而GSO默认都是开启的,因此tcp层调用这个接口,会返回true
//2、dev层将skb发送给驱动前调用,判断是否需要做TSO,这个调用流程里,features直接等于dev->features,
// 如果网卡没有TSO能力,则features不会有TSO的标志,那这个函数就会返回false
netdev_features_t feature = gso_type & SKB_GSO1_MASK;
feature <<= NETIF_F_GSO_SHIFT;
if (gso_type & SKB_GSO2_MASK) {
netdev_features_t f = gso_type & SKB_GSO2_MASK;
f <<= NETIF_F_GSO2_SHIFT;
feature |= f;
}
/* check flags correspondence */
BUILD_BUG_ON(SKB_GSO_TCPV4 != (NETIF_F_TSO >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_UDP != (NETIF_F_UFO >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_DODGY != (NETIF_F_GSO_ROBUST >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_TCP_ECN != (NETIF_F_TSO_ECN >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_TCPV6 != (NETIF_F_TSO6 >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_FCOE != (NETIF_F_FSO >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_GRE != (NETIF_F_GSO_GRE >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_IPIP != (NETIF_F_GSO_IPIP >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_SIT != (NETIF_F_GSO_SIT >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL != (NETIF_F_GSO_UDP_TUNNEL >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_MPLS != (NETIF_F_GSO_MPLS >> NETIF_F_GSO_SHIFT));
/* GSO2 flags, see netdev_features.h */
BUILD_BUG_ON(SKB_GSO_GRE_CSUM != (NETIF_F_GSO_GRE_CSUM >> NETIF_F_GSO2_SHIFT));
BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL_CSUM != (NETIF_F_GSO_UDP_TUNNEL_CSUM >> NETIF_F_GSO2_SHIFT));
BUILD_BUG_ON(SKB_GSO_PARTIAL != (NETIF_F_GSO_PARTIAL >> NETIF_F_GSO2_SHIFT));
BUILD_BUG_ON(SKB_GSO_SCTP != (NETIF_F_GSO_SCTP >> NETIF_F_GSO2_SHIFT));
BUILD_BUG_ON(SKB_GSO_TCP_FIXEDID != (NETIF_F_TSO_MANGLEID >> NETIF_F_GSO2_SHIFT));
return (features & feature) == feature;
}
协议层校验支持gso后,会同时开启分散、聚合及csum校验能力。
void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
sk_dst_set(sk, dst);
sk->sk_route_caps = dst->dev->features;
if (sk->sk_route_caps & NETIF_F_GSO)
sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
sk->sk_route_caps &= ~sk->sk_route_nocaps;
if (sk_can_gso(sk)) {
//skb头部需要额外空间,关闭GSO
if (dst->header_len) {
sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
} else {
//开启skb的分散、聚合及csum功能,因为网卡做TSO的同时需要支持分散、聚合功能以及csum的重新计算能力
sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
sk->sk_gso_max_size = dst->dev->gso_max_size;
sk->sk_gso_max_segs = dst->dev->gso_max_segs;
}
}
}
5、应用程序调用send发送数据包,send系统调用最终调用tcp_sendmsg,在tcp_sendmsg里判断是否支持GSO,支持的话将用户数据信息封装到skb的线性区或非线性区,封装完后的skb数据包就是一个大包了,然后调用tcp_push_one发送给IP层,当然发送之前还会调用check函数,根据csum的类型计算tcp层的csum,支持GSO、TSO情况下,tcp层只会计算伪头部的csum。
int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
size_t size)
{
struct iovec *iov;
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
int iovlen, flags, err, copied = 0;
int mss_now = 0, size_goal, copied_syn = 0, offset = 0;
bool sg;
long timeo;
lock_sock(sk);
flags = msg->msg_flags;
if (flags & MSG_FASTOPEN) {
err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
if (err == -EINPROGRESS && copied_syn > 0)
goto out;
else if (err)
goto out_err;
offset = copied_syn;
}
timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
/* Wait for a connection to finish. One exception is TCP Fast Open
* (passive side) where data is allowed to be sent before a connection
* is fully established.
*/
if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
!tcp_passive_fastopen(sk)) {
if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
goto do_error;
}
if (unlikely(tp->repair)) {
if (tp->repair_queue == TCP_RECV_QUEUE) {
copied = tcp_send_rcvq(sk, msg, size);
goto out_nopush;
}
err = -EINVAL;
if (tp->repair_queue == TCP_NO_QUEUE)
goto out_err;
/* 'common' sending to sendq */
}
/* This should be in poll */
clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
//获取mss,如果支持GSO,这里获取的是协商到的mss的整数倍
mss_now = tcp_send_mss(sk, &size_goal, flags);
/* Ok commence sending. */
iovlen = msg->msg_iovlen;
iov = msg->msg_iov;
copied = 0;
err = -EPIPE;
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
goto out_err;
//判断是否有分散、聚合能力
sg = !!(sk->sk_route_caps & NETIF_F_SG);
while (--iovlen >= 0) {
size_t seglen = iov->iov_len;
unsigned char __user *from = iov->iov_base;
iov++;
if (unlikely(offset > 0)) { /* Skip bytes copied in SYN */
if (offset >= seglen) {
offset -= seglen;
continue;
}
seglen -= offset;
from += offset;
offset = 0;
}
while (seglen > 0) {
int copy = 0;
int max = size_goal;
//获取write队列的最后一个skb
skb = tcp_write_queue_tail(sk);
if (tcp_send_head(sk)) {
if (skb->ip_summed == CHECKSUM_NONE)
max = mss_now;
//copy表示skb可以存放的最大的数据长度
copy = max - skb->len;
}
//如果skb->len >= max,说明这个skb已经填满数据了;需要重新分配一个skb
if (copy <= 0) {
new_segment:
/* Allocate new segment. If the interface is SG,
* allocate skb fitting to single page.
*/
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
skb = sk_stream_alloc_skb(sk,
//获取skb头的长度
select_size(sk, sg),
sk->sk_allocation);
if (!skb)
goto wait_for_memory;
/*
* Check whether we can use HW checksum.
*/
//在sk_setup_caps,已经置上的csum能力,设置ip_summed模式为CHECKSUM_PARTIAL
//意思是协议栈只做ip头和伪头部的checksum,palyload需要硬件帮忙做
if (sk->sk_route_caps & NETIF_F_CSUM_MASK)
skb->ip_summed = CHECKSUM_PARTIAL;
//分完一个skb后,将其加入sk->sk_write_queue队列中
skb_entail(sk, skb);
copy = size_goal;
max = size_goal;
/* All packets are restored as if they have
* already been sent. skb_mstamp isn't set to
* avoid wrong rtt estimation.
*/
if (tp->repair)
TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED;
}
/* Try to append data to the end of skb. */
//拷贝的数据最多不超过用户发送的消息长度
if (copy > seglen)
copy = seglen;
/* Where to copy to? */
//判断线性区是否还有空间
if (skb_availroom(skb) > 0) {
/* We have some space in skb head. Superb! */
copy = min_t(int, copy, skb_availroom(skb));
//将用户数据拷贝到线性区,并同步更新skb->len
err = skb_add_data_nocache(sk, skb, from, copy);
if (err)
goto do_fault;
} else {
//线性区已经没有空间,将报文信息放到skinfo里
bool merge = true;
int i = skb_shinfo(skb)->nr_frags;
struct page_frag *pfrag = sk_page_frag(sk);
//sk_page_frag表示当前的skb_shinfo的最后一个frags,这里判断最后一个frags的page是否还有
//空间可以存放数据(最小是32字节),如果没有,则重新分配一个page并放到pfrag->page里
if (!sk_page_frag_refill(sk, pfrag))
goto wait_for_memory;
//判断pfrag->page是否是sk_page_frag指向的最后一个page,如果是,则表明上一步判断
//sk_page_frag里的page还有足够空间可以存放数据;
//如果不是,则表明上一步有重新分配过page页,把merge置为false,接下去需要把这个新分配的page页
//添加到skb_shinfo里.
if (!skb_can_coalesce(skb, i, pfrag->page,
pfrag->offset)) {
if (i == MAX_SKB_FRAGS || !sg) {
tcp_mark_push(tp, skb);
goto new_segment;
}
merge = false;
}
//取需要拷贝的数据包长度与page剩余空间的最小值
copy = min_t(int, copy, pfrag->size - pfrag->offset);
if (!sk_wmem_schedule(sk, copy))
goto wait_for_memory;
//拷贝用户数据到pfrag->page里,并同步更新skb->len和skb->data_len,所以skb->data_len只表示非线性区的长度
err = skb_copy_to_page_nocache(sk, from, skb,
pfrag->page,
pfrag->offset,
copy);
if (err)
goto do_error;
/* Update the skb. */
//如果是合并操作,则修改最后一个page的大小信息
if (merge) {
skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
} else {
//新分配的page页,添加到skb_shinfo(skb)->frags[i]里
//同时将skb_shinfo(skb)->nr_frags值增1
skb_fill_page_desc(skb, i, pfrag->page,
pfrag->offset, copy);
//page引用计数加1
get_page(pfrag->page);
}
pfrag->offset += copy;
}
if (!copied)
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
tp->write_seq += copy;
TCP_SKB_CB(skb)->end_seq += copy;
skb_shinfo(skb)->gso_segs = 0;
from += copy;
copied += copy;
if ((seglen -= copy) == 0 && iovlen == 0)
goto out;
if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))
continue;
if (forced_push(tp)) {
tcp_mark_push(tp, skb);
__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
} else if (skb == tcp_send_head(sk))
tcp_push_one(sk, mss_now);
continue;
wait_for_sndbuf:
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
if (copied)
tcp_push(sk, flags & ~MSG_MORE, mss_now,
TCP_NAGLE_PUSH, size_goal);
if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
goto do_error;
mss_now = tcp_send_mss(sk, &size_goal, flags);
}
}
out:
if (copied)
tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
out_nopush:
release_sock(sk);
return copied + copied_syn;
do_fault:
if (!skb->len) {
tcp_unlink_write_queue(skb, sk);
/* It is the one place in all of TCP, except connection
* reset, where we can be unlinking the send_head.
*/
tcp_check_send_head(sk, skb);
sk_wmem_free_skb(sk, skb);
}
do_error:
if (copied + copied_syn)
goto out;
out_err:
err = sk_stream_error(sk, flags, err);
release_sock(sk);
return err;
}
6、在tcp_write_xmit流程里,通过tcp_init_tso_segs设置gso_size及分段个数gso_segs,其中gso_size为mss值,这两个参数用于告诉硬件做tso拆分时,需要拆分成的数据包个数及长度。
static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
unsigned int mss_now)
{
struct skb_shared_info *shinfo = skb_shinfo(skb);
/* Make sure we own this skb before messing gso_size/gso_segs */
WARN_ON_ONCE(skb_cloned(skb));
if (skb->len <= mss_now || !sk_can_gso(sk) ||
skb->ip_summed == CHECKSUM_NONE) {
/* Avoid the costly divide in the normal
* non-TSO case.
*/
shinfo->gso_segs = 1;
shinfo->gso_size = 0;
shinfo->gso_type = 0;
} else {
//gso_segs为数据包总长度除mss
shinfo->gso_segs = DIV_ROUND_UP(skb->len, mss_now);
//gso_size为mss值,硬件做拆分时,会按gso_size的长度拆分每个数据包
shinfo->gso_size = mss_now;
shinfo->gso_type = sk->sk_gso_type;
}
}
7、dev层发送给驱动之前,进一步校验网卡是否具有TSO能力,如果没有,则回调tcp的分段函数完成skb的分段处理,如果支持,则直接发送给驱动;
static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
{
netdev_features_t features;
if (skb->next)
return skb;
features = netif_skb_features(skb);
skb = validate_xmit_vlan(skb, features);
if (unlikely(!skb))
goto out_null;
//这里的featrues就是dev->features,如果网卡不具有TSO,则dev->features不会有TSO的标志,这个函数返回true
if (netif_needs_gso(skb, features)) {
struct sk_buff *segs;
segs = skb_gso_segment(skb, features);
if (IS_ERR(segs)) {
goto out_kfree_skb;
} else if (segs) {
consume_skb(skb);
skb = segs;
}
} else {
if (skb_needs_linearize(skb, features) &&
__skb_linearize(skb))
goto out_kfree_skb;
/* If packet is not checksummed and device does not
* support checksumming for this protocol, complete
* checksumming here.
*/
if (skb->ip_summed == CHECKSUM_PARTIAL) {
if (skb->encapsulation)
skb_set_inner_transport_header(skb,
skb_checksum_start_offset(skb));
else
skb_set_transport_header(skb,
skb_checksum_start_offset(skb));
if (skb_csum_hwoffload_help(skb, features))
goto out_kfree_skb;
}
}
return skb;
out_kfree_skb:
kfree_skb(skb);
out_null:
return NULL;
}
8、如果需要做gso分段,则先进入ip层的分段处理,在ip层分段处理函数里,主要工作是调用tcp层的分段处理函数,等tcp层分段完成后,重新对分段的skb的ip头做checksum;
static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
bool udpfrag = false, fixedid = false, gso_partial, encap;
struct sk_buff *segs = ERR_PTR(-EINVAL);
const struct net_offload *ops;
unsigned int offset = 0;
struct iphdr *iph;
int proto, tot_len;
int nhoff;
int ihl;
int id;
//设置ip头基于head的偏移
skb_reset_network_header(skb);
//ip头基于mac头的偏移,即使就是mac头的长度
nhoff = skb_network_header(skb) - skb_mac_header(skb);
if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
goto out;
iph = ip_hdr(skb);
ihl = iph->ihl * 4;
if (ihl < sizeof(*iph))
goto out;
id = ntohs(iph->id);
proto = iph->protocol;
/* Warning: after this point, iph might be no longer valid */
if (unlikely(!pskb_may_pull(skb, ihl)))
goto out;
//剥离ip头
__skb_pull(skb, ihl);
encap = SKB_GSO_CB(skb)->encap_level > 0;
if (encap)
features &= skb->dev->hw_enc_features;
SKB_GSO_CB(skb)->encap_level += ihl;
//设置tcp头基于head的偏移
skb_reset_transport_header(skb);
segs = ERR_PTR(-EPROTONOSUPPORT);
if (!skb->encapsulation || encap) {
udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID);
/* fixed ID is invalid if DF bit is not set */
if (fixedid && !(ip_hdr(skb)->frag_off & htons(IP_DF)))
goto out;
}
ops = rcu_dereference(inet_offloads[proto]);
if (likely(ops && ops->callbacks.gso_segment))
segs = ops->callbacks.gso_segment(skb, features);
if (IS_ERR_OR_NULL(segs))
goto out;
gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
skb = segs;
do {
//重新为每个分段skb设置ip头信息
iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
if (udpfrag) {
iph->frag_off = htons(offset >> 3);
if (skb->next != NULL)
iph->frag_off |= htons(IP_MF);
offset += skb->len - nhoff - ihl;
tot_len = skb->len - nhoff;
} else if (skb_is_gso(skb)) {
if (!fixedid) {
iph->id = htons(id);
id += skb_shinfo(skb)->gso_segs;
}
if (gso_partial)
tot_len = skb_shinfo(skb)->gso_size +
SKB_GSO_CB(skb)->data_offset +
skb->head - (unsigned char *)iph;
else
tot_len = skb->len - nhoff;
} else {
if (!fixedid)
iph->id = htons(id++);
tot_len = skb->len - nhoff;
}
iph->tot_len = htons(tot_len);
//为每个分段skb的ip头做checksum
ip_send_check(iph);
if (encap)
skb_reset_inner_headers(skb);
skb->network_header = (u8 *)iph - skb->head;
} while ((skb = skb->next));
out:
return segs;
}
9、进入tcp层分段处理函数后,会调用tcp_gso_segment完成skb分段,分段完成后,重新为每个分段skb做tcp层的checksum,以及为每个分段skb重新分配seq序列号等;
struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
unsigned int sum_truesize = 0;
struct tcphdr *th;
unsigned int thlen;
unsigned int seq;
__be32 delta;
unsigned int oldlen;
unsigned int mss;
struct sk_buff *gso_skb = skb;
__sum16 newcheck;
bool ooo_okay, copy_destructor;
th = tcp_hdr(skb);
thlen = th->doff * 4;
if (thlen < sizeof(*th))
goto out;
if (!pskb_may_pull(skb, thlen))
goto out;
oldlen = (u16)~skb->len;
__skb_pull(skb, thlen);
mss = skb_shinfo(skb)->gso_size;
if (unlikely(skb->len <= mss))
goto out;
if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
/* Packet is from an untrusted source, reset gso_segs. */
skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
segs = NULL;
goto out;
}
copy_destructor = gso_skb->destructor == tcp_wfree;
ooo_okay = gso_skb->ooo_okay;
/* All segments but the first should have ooo_okay cleared */
skb->ooo_okay = 0;
//真正做分段的处理函数
segs = skb_segment(skb, features);
if (IS_ERR(segs))
goto out;
/* Only first segment might have ooo_okay set */
segs->ooo_okay = ooo_okay;
/* GSO partial and frag_list segmentation only requires splitting
* the frame into an MSS multiple and possibly a remainder, both
* cases return a GSO skb. So update the mss now.
*/
if (skb_is_gso(segs))
mss *= skb_shinfo(segs)->gso_segs;
delta = htonl(oldlen + (thlen + mss));
skb = segs;
th = tcp_hdr(skb);
seq = ntohl(th->seq);
newcheck = ~csum_fold((__force __wsum)((__force u32)th->check +
(__force u32)delta));
while (skb->next) {
th->fin = th->psh = 0;
th->check = newcheck;
//为每个分段skb做tcp层checksum
if (skb->ip_summed == CHECKSUM_PARTIAL)
gso_reset_checksum(skb, ~th->check);
else
th->check = gso_make_checksum(skb, ~th->check);
//设置skb的序列号,拆分后,除了最后一个skb,其余大小均为mss
seq += mss;
if (copy_destructor) {
skb->destructor = gso_skb->destructor;
skb->sk = gso_skb->sk;
sum_truesize += skb->truesize;
}
skb = skb->next;
th = tcp_hdr(skb);
th->seq = htonl(seq);
th->cwr = 0;
}
/* Following permits TCP Small Queues to work well with GSO :
* The callback to TCP stack will be called at the time last frag
* is freed at TX completion, and not right now when gso_skb
* is freed by GSO engine
*/
if (copy_destructor) {
swap(gso_skb->sk, skb->sk);
swap(gso_skb->destructor, skb->destructor);
sum_truesize += skb->truesize;
atomic_add(sum_truesize - gso_skb->truesize,
&skb->sk->sk_wmem_alloc);
}
delta = htonl(oldlen + (skb_tail_pointer(skb) -
skb_transport_header(skb)) +
skb->data_len);
th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
(__force u32)delta));
if (skb->ip_summed == CHECKSUM_PARTIAL)
gso_reset_checksum(skb, ~th->check);
else
th->check = gso_make_checksum(skb, ~th->check);
out:
return segs;
}
可以看到tcp_gso_segment里真正去做skb分段处理的是在skb_segment,skb_segment里将tso的skb按mss长度进行分段处理,对线性区域的数据,直接拷贝到分段skb的线性区域,对于非线性区域数据,直接将frags指针指向分段skb的frags;
struct sk_buff *skb_segment(struct sk_buff *head_skb,
netdev_features_t features)
{
struct sk_buff *segs = NULL;
struct sk_buff *tail = NULL;
//flag_list存放ip分片数包,在ip_do_fragment里会设置
struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list;
//frags存放分散、聚合的非线性区数据包
skb_frag_t *frag = skb_shinfo(head_skb)->frags;
unsigned int mss = skb_shinfo(head_skb)->gso_size;
//doffset为ip头+mac头的大小之和
unsigned int doffset = head_skb->data - skb_mac_header(head_skb);
struct sk_buff *frag_skb = head_skb;
unsigned int offset = doffset;
unsigned int tnl_hlen = skb_tnl_header_len(head_skb);
unsigned int partial_segs = 0;
unsigned int headroom;
unsigned int len = head_skb->len;
__be16 proto;
bool csum, sg;
int nfrags = skb_shinfo(head_skb)->nr_frags;
int err = -ENOMEM;
int i = 0;
int pos;
int dummy;
//为首个skb分配ip、mac头空间
__skb_push(head_skb, doffset);
proto = skb_network_protocol(head_skb, &dummy);
if (unlikely(!proto))
return ERR_PTR(-EINVAL);
sg = !!(features & NETIF_F_SG);
csum = !!can_checksum_protocol(features, proto);
if (sg && csum && (mss != GSO_BY_FRAGS)) {
if (!(features & NETIF_F_GSO_PARTIAL)) {
struct sk_buff *iter;
if (!list_skb ||
!net_gso_ok(features, skb_shinfo(head_skb)->gso_type))
goto normal;
/* Split the buffer at the frag_list pointer.
* This is based on the assumption that all
* buffers in the chain excluding the last
* containing the same amount of data.
*/
skb_walk_frags(head_skb, iter) {
if (skb_headlen(iter))
goto normal;
len -= iter->len;
}
}
/* GSO partial only requires that we trim off any excess that
* doesn't fit into an MSS sized block, so take care of that
* now.
*/
partial_segs = len / mss;
if (partial_segs > 1)
mss *= partial_segs;
else
partial_segs = 0;
}
normal:
headroom = skb_headroom(head_skb);
//获取线性区的长度skb->len - skb->data_len
pos = skb_headlen(head_skb);
do {
struct sk_buff *nskb;
skb_frag_t *nskb_frag;
int hsize;
int size;
if (unlikely(mss == GSO_BY_FRAGS)) {
len = list_skb->len;
} else {
//没新增一个分段skb,offset累加分段skb的长度
//len为下一个新增分段skb的长度, 最多不超过mss值
len = head_skb->len - offset;
if (len > mss)
len = mss;
}
//skb_headlen = skb->len - skb->data_len表示skb的线性区域长度,当第一次分段时,
//理论上线性区域就拷贝完成了,因此第二次的时候hsize应该就小于0了;
//当hsize小于0时,直接将其赋值为0,接下来新分配的分段skb就不会再申请skb->data
//线性空间了,而是直接拷贝非线性区的数据
hsize = skb_headlen(head_skb) - offset;
if (hsize < 0)
hsize = 0;
if (hsize > len || !sg)
hsize = len;
//拷贝ip分片的数据包
if (!hsize && i >= nfrags && skb_headlen(list_skb) &&
(skb_headlen(list_skb) == len || sg)) {
BUG_ON(skb_headlen(list_skb) > len);
i = 0;
nfrags = skb_shinfo(list_skb)->nr_frags;
frag = skb_shinfo(list_skb)->frags;
frag_skb = list_skb;
pos += skb_headlen(list_skb);
while (pos < offset + len) {
BUG_ON(i >= nfrags);
size = skb_frag_size(frag);
if (pos + size > offset + len)
break;
i++;
pos += size;
frag++;
}
nskb = skb_clone(list_skb, GFP_ATOMIC);
list_skb = list_skb->next;
if (unlikely(!nskb))
goto err;
if (unlikely(pskb_trim(nskb, len))) {
kfree_skb(nskb);
goto err;
}
hsize = skb_end_offset(nskb);
if (skb_cow_head(nskb, doffset + headroom)) {
kfree_skb(nskb);
goto err;
}
nskb->truesize += skb_end_offset(nskb) - hsize;
skb_release_head_state(nskb);
__skb_push(nskb, doffset);
} else {
//拷贝skb的线性区和非线性区
nskb = __alloc_skb(hsize + doffset + headroom,
GFP_ATOMIC, skb_alloc_rx_flag(head_skb),
NUMA_NO_NODE);
if (unlikely(!nskb))
goto err;
skb_reserve(nskb, headroom);
__skb_put(nskb, doffset);
}
//segs为空时,nskb作为首个skb赋给segs,否则将新的nskb挂到next里
if (segs)
tail->next = nskb;
else
segs = nskb;
tail = nskb;
//从首个skb里拷贝skb头信息
__copy_skb_header(nskb, head_skb);
skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom);
//设置mac头长度
skb_reset_mac_len(nskb);
skb_copy_from_linear_data_offset(head_skb, -tnl_hlen,
nskb->data - tnl_hlen,
doffset + tnl_hlen);
if (nskb->len == len + doffset)
goto perform_csum_check;
if (!sg) {
if (!nskb->remcsum_offload)
nskb->ip_summed = CHECKSUM_NONE;
SKB_GSO_CB(nskb)->csum =
skb_copy_and_csum_bits(head_skb, offset,
skb_put(nskb, len),
len, 0);
SKB_GSO_CB(nskb)->csum_start =
skb_headroom(nskb) + doffset;
continue;
}
nskb_frag = skb_shinfo(nskb)->frags;
//将skb的线性区域拷贝到nskb上
//hsize=0时,没有线性区域数据需要拷贝
skb_copy_from_linear_data_offset(head_skb, offset,
skb_put(nskb, hsize), hsize);
skb_shinfo(nskb)->tx_flags = skb_shinfo(head_skb)->tx_flags &
SKBTX_SHARED_FRAG;
//pos的初始值为线性区长度,offset+len表示本地分段skb做完后,需要拷贝的总的数据长度,
//当pos<offset+len时,表示线性区已经拷贝完了,但每次分段skb数据还没拷贝完成,
//接来下就只能从非线性区拷贝了,每拷贝完一个frags,pos长度也相应增加对应
//的frags长度,直到本次分段skb数据拷贝完成
while (pos < offset + len) {
//非线性区frags也拷贝完成,则从ip分片区拷贝
if (i >= nfrags) {
BUG_ON(skb_headlen(list_skb));
i = 0;
nfrags = skb_shinfo(list_skb)->nr_frags;
frag = skb_shinfo(list_skb)->frags;
frag_skb = list_skb;
BUG_ON(!nfrags);
list_skb = list_skb->next;
}
if (unlikely(skb_shinfo(nskb)->nr_frags >=
MAX_SKB_FRAGS)) {
net_warn_ratelimited(
"skb_segment: too many frags: %u %u\n",
pos, mss);
goto err;
}
if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC)))
goto err;
//拷贝非线性区时,并非是把原来的非线性区数据拷贝到新的分段skb的线性区
//而是直接将分段skb的frags指针指向原来skb的frags
*nskb_frag = *frag;
__skb_frag_ref(nskb_frag);
size = skb_frag_size(nskb_frag);
if (pos < offset) {
nskb_frag->page_offset += offset - pos;
skb_frag_size_sub(nskb_frag, offset - pos);
}
skb_shinfo(nskb)->nr_frags++;
//拷贝完成一个frags,修改pos长度
if (pos + size <= offset + len) {
i++;
frag++;
pos += size;
} else {
skb_frag_size_sub(nskb_frag, pos + size - (offset + len));
goto skip_fraglist;
}
nskb_frag++;
}
skip_fraglist:
nskb->data_len = len - hsize;
nskb->len += nskb->data_len;
nskb->truesize += nskb->data_len;
perform_csum_check:
if (!csum) {
if (skb_has_shared_frag(nskb)) {
err = __skb_linearize(nskb);
if (err)
goto err;
}
if (!nskb->remcsum_offload)
nskb->ip_summed = CHECKSUM_NONE;
SKB_GSO_CB(nskb)->csum =
skb_checksum(nskb, doffset,
nskb->len - doffset, 0);
SKB_GSO_CB(nskb)->csum_start =
skb_headroom(nskb) + doffset;
}
} while ((offset += len) < head_skb->len); //拷贝的数据长度还没到整个skb的长度,进入下一次分段
/* Some callers want to get the end of the list.
* Put it in segs->prev to avoid walking the list.
* (see validate_xmit_skb_list() for example)
*/
segs->prev = tail;
if (partial_segs) {
struct sk_buff *iter;
int type = skb_shinfo(head_skb)->gso_type;
unsigned short gso_size = skb_shinfo(head_skb)->gso_size;
/* Update type to add partial and then remove dodgy if set */
type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL;
type &= ~SKB_GSO_DODGY;
/* Update GSO info and prepare to start updating headers on
* our way back down the stack of protocols.
*/
for (iter = segs; iter; iter = iter->next) {
skb_shinfo(iter)->gso_size = gso_size;
skb_shinfo(iter)->gso_segs = partial_segs;
skb_shinfo(iter)->gso_type = type;
SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset;
}
if (tail->len - doffset <= gso_size)
skb_shinfo(tail)->gso_size = 0;
else if (tail != segs)
skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size);
}
/* Following permits correct backpressure, for protocols
* using skb_set_owner_w().
* Idea is to tranfert ownership from head_skb to last segment.
*/
if (head_skb->destructor == sock_wfree) {
swap(tail->truesize, head_skb->truesize);
swap(tail->destructor, head_skb->destructor);
swap(tail->sk, head_skb->sk);
}
return segs;
err:
kfree_skb_list(segs);
return ERR_PTR(err);
}
10、分段处理完成后,返回分段的skb链表,然后将分段好的skb链表进一步发送(dev(qdisc) —>驱动 —–>网卡)。
今天的文章Linux TSO流程分析分享到此就结束了,感谢您的阅读。
版权声明:本文内容由互联网用户自发贡献,该文观点仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 举报,一经查实,本站将立刻删除。
如需转载请保留出处:https://bianchenghao.cn/7209.html