The previous section first reviewed network layering and networking fundamentals, then revisited TCP/UDP and socket programming, and finally walked through the implementations of the five connection-establishment functions (socket, bind, listen, connect, accept) together with the TCP connection setup process. This section moves on to the implementations of send and recv and everything around transmitting and receiving packets. Unlike socket, bind, listen, accept, and connect, sending and receiving data touches many kernel subsystems: the path a packet travels between the NIC and a user-space program is long, crossing the socket layer, the transport layer, the network layer, the routing subsystem, the neighbour subsystem, the device subsystem, and the network device driver. The walkthrough below uses TCP as the example for both the transmit and receive flows.
ssize_t send(int sockfd, const void *buf, size_t len, int flags);
ssize_t sendto(int sockfd, const void *buf, size_t len, int flags, const struct sockaddr *dest_addr, socklen_t addrlen);
ssize_t sendmsg(int sockfd, const struct msghdr *msg, int flags);
These three functions transmit data to another socket.
send can only be used on a connection-oriented socket.
send differs from write only in the flags argument; with flags equal to 0, send and write are equivalent.
send(sockfd, buf, len, flags) is likewise equivalent to sendto(sockfd, buf, len, flags, NULL, 0).
For sendmsg, the destination address is carried in msg.msg_name.
The corresponding receive functions mirror the send family:
ssize_t recv(int sockfd, void *buf, size_t len, int flags);
ssize_t recvfrom(int sockfd, void *buf, size_t len, int flags, struct sockaddr *src_addr, socklen_t *addrlen);
ssize_t recvmsg(int sockfd, struct msghdr *msg, int flags);
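A minimal user-space sketch of the equivalences above (UDP over loopback; the address 127.0.0.1:9999 and the lack of error handling are assumptions for brevity), where the three send variants push identical bytes through one connected socket:
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <unistd.h>
int main(void)
{
    int fd = socket(AF_INET, SOCK_DGRAM, 0);
    struct sockaddr_in dst = { .sin_family = AF_INET, .sin_port = htons(9999) };
    inet_pton(AF_INET, "127.0.0.1", &dst.sin_addr);
    connect(fd, (struct sockaddr *)&dst, sizeof(dst)); //fix the peer address
    const char buf[] = "hello";
    send(fd, buf, sizeof(buf), 0); //plain send on the connected socket
    sendto(fd, buf, sizeof(buf), 0, NULL, 0); //equivalent: NULL address
    struct iovec iov = { .iov_base = (void *)buf, .iov_len = sizeof(buf) };
    struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1 }; //msg_name stays NULL: socket is connected
    sendmsg(fd, &msg, 0); //equivalent; an unconnected socket would set msg.msg_name instead
    close(fd);
    return 0;
}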
To user space a socket looks like a file and has a file descriptor, so besides the send family a network packet can also be sent through the ordinary file write path, that is, the write system call.
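A quick sketch of that point (using a UNIX-domain socketpair purely for demonstration): write on a socket fd behaves exactly like send with flags == 0:
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>
int main(void)
{
    int sv[2];
    char rx[32] = { 0 };
    socketpair(AF_UNIX, SOCK_STREAM, 0, sv); //a pair of connected sockets
    write(sv[0], "via write,", 10); //same path as send(sv[0], ..., 0)
    send(sv[0], " via send", 9, 0);
    read(sv[1], rx, sizeof(rx) - 1); //stream socket: both payloads arrive in order
    printf("%s\n", rx);
    close(sv[0]);
    close(sv[1]);
    return 0;
}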
write calls ksys_write, which calls vfs_write, which ultimately invokes file->f_op->write_iter:
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
ssize_t ret;
//...
else if (file->f_op->write_iter)
ret = new_sync_write(file, buf, count, pos);
//...
}
static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
{
//...
ret = call_write_iter(filp, &kiocb, &iter);
//...
}
static inline ssize_t call_write_iter(struct file *file, struct kiocb *kio,
struct iov_iter *iter)
{
return file->f_op->write_iter(kio, iter);
}
In the previous chapter's analysis of socket creation, sock_map_fd called sock_alloc_file, which is what creates the file pointer backing the socket:
/**
* sock_alloc_file - Bind a &socket to a &file
* @sock: socket
* @flags: file status flags
* @dname: protocol name
*
* Returns the &file bound with @sock, implicitly storing it
* in sock->file. If dname is %NULL, sets to "".
* On failure the return is a ERR pointer (see linux/err.h).
* This function uses GFP_KERNEL internally.
*/
struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)
{
struct file *file;
//...
file = alloc_file_pseudo(SOCK_INODE(sock), sock_mnt, dname,
O_RDWR | (flags & O_NONBLOCK),
&socket_file_ops);
//...
sock->file = file;
file->private_data = sock;
stream_open(SOCK_INODE(sock), file);
return file;
}
struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
const char *name, int flags,
const struct file_operations *fops)
{
//...
file = alloc_file(&path, flags, fops);
//...
}
/**
* alloc_file - allocate and initialize a 'struct file'
*
* @path: the (dentry, vfsmount) pair for the new file
* @flags: O_... flags with which the new file will be opened
* @fop: the 'struct file_operations' for the new file
*/
static struct file *alloc_file(const struct path *path, int flags,
const struct file_operations *fop)
{
struct file *file;
file = alloc_empty_file(flags, current_cred());
//...
file->f_op = fop;
//...
}
As the code shows, the socket file's file->f_op is socket_file_ops:
/*
* Socket files have a set of 'special' operations as well as the generic file ones. These don't appear
* in the operation structures but are done directly via the socketcall() multiplexor.
*/
static const struct file_operations socket_file_ops = {
.owner = THIS_MODULE,
.llseek = no_llseek,
.read_iter = sock_read_iter,
.write_iter = sock_write_iter,
.poll = sock_poll,
.unlocked_ioctl = sock_ioctl,
.mmap = sock_mmap,
.release = sock_close,
.fasync = sock_fasync,
.sendpage = sock_sendpage,
.splice_write = generic_splice_sendpage,
.splice_read = sock_splice_read,
.show_fdinfo = sock_show_fdinfo,
};
So the file->f_op->write_iter invoked from vfs_write is sock_write_iter, which in turn calls sock_sendmsg:
static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
//...
res = sock_sendmsg(sock, &msg);
//...
}
From this point, a write on the socket has been handed over to the socket layer.
Looking at sock_sendmsg next: an actual send call also goes through this function.
/**
* sock_sendmsg - send a message through @sock
* @sock: socket
* @msg: message to send
*
* Sends @msg through @sock, passing through LSM.
* Returns the number of bytes sent, or an error code.
*/
int sock_sendmsg(struct socket *sock, struct msghdr *msg)
{
//...
return err ?: sock_sendmsg_nosec(sock, msg);
}
static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg)
{
int ret = INDIRECT_CALL_INET(sock->ops->sendmsg, inet6_sendmsg,
inet_sendmsg, sock, msg,
msg_data_left(msg));
//...
}
int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
{
struct sock *sk = sock->sk;
//...
return INDIRECT_CALL_2(sk->sk_prot->sendmsg, tcp_sendmsg, udp_sendmsg,sk, msg, size);
}
Finally, just as in the previous chapter, control reaches the proto's sendmsg and passes to the transport layer.
The flow from here is very long; the comments in the code below trace it.
int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
{
int ret;
lock_sock(sk);
ret = tcp_sendmsg_locked(sk, msg, size);
release_sock(sk);
return ret;
}
int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
{
struct tcp_sock *tp = tcp_sk(sk);
struct ubuf_info *uarg = NULL;
struct sk_buff *skb;
struct sockcm_cookie sockc;
int flags, err, copied = 0;
int mss_now = 0, size_goal, copied_syn = 0;
int process_backlog = 0;
bool zc = false;
long timeo;
//...
/* Ok commence sending. */
//copied counts how many bytes of the user's data have been copied into the sk_buffs so far
copied = 0;
restart:
//MSS, Max Segment Size: the size limit for packets on the wire, enforced from the lowest layer up;
//it equals the L2 MTU minus the IP and TCP headers, i.e. the most TCP payload that fits without fragmentation
mss_now = tcp_send_mss(sk, &size_goal, flags);
//...
//transmit loop: keep going until all the user's data has been queued
while (msg_data_left(msg)) {
//bytes to copy in this round
int copy = 0;
//take the last struct sk_buff on the TCP write queue sock->sk_write_queue;
//only the tail skb can still have room, e.g. when the previous write was too small to fill it
skb = tcp_write_queue_tail(sk);
//work out how much space is left in this skb
if (skb)
copy = size_goal - skb->len;
//no room left in the skb (or it cannot be appended to): allocate a new one
if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
bool first_skb;
new_segment:
if (!sk_stream_memory_free(sk))
goto wait_for_space;
//after 16 rounds, drain sk_backlog once and restart the flow
if (unlikely(process_backlog >= 16)) {
process_backlog = 0;
if (sk_flush_backlog(sk))
goto restart;
}
first_skb = tcp_rtx_and_write_queues_empty(sk);
//allocate a fresh struct sk_buff;
//sk_stream_alloc_skb eventually reaches __alloc_skb, which allocates both the sk_buff structure and the data area it points to
skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
first_skb);
if (!skb)
goto wait_for_space;
process_backlog++;
skb->ip_summed = CHECKSUM_PARTIAL;
//skb_entail calls tcp_add_write_queue_tail to put the new sk_buff at the tail of the queue
skb_entail(sk, skb);
copy = size_goal;
/* All packets are restored as if they have
* already been sent. skb_mstamp_ns isn't set to
* avoid wrong rtt estimation.
*/
if (tp->repair)
TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED;
}
/* Try to append data to the end of skb. */
if (copy > msg_data_left(msg))
copy = msg_data_left(msg);
/* Where to copy to? */
if (skb_availroom(skb) > 0 && !zc) {
/* We have some space in skb head. Superb! */
copy = min_t(int, copy, skb_availroom(skb));
//copy the data into the contiguous (linear) area
err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
if (err)
goto do_fault;
} else if (!zc) {
bool merge = true;
int i = skb_shinfo(skb)->nr_frags;
struct page_frag *pfrag = sk_page_frag(sk);
if (!sk_page_frag_refill(sk, pfrag))
goto wait_for_space;
if (!skb_can_coalesce(skb, i, pfrag->page,
pfrag->offset)) {
if (i >= sysctl_max_skb_frags) {
tcp_mark_push(tp, skb);
goto new_segment;
}
merge = false;
}
copy = min_t(int, copy, pfrag->size - pfrag->offset);
if (!sk_wmem_schedule(sk, copy))
goto wait_for_space;
//copy the data into the paged, non-contiguous area referenced by struct skb_shared_info
err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
pfrag->page,
pfrag->offset,
copy);
if (err)
goto do_error;
/* Update the skb. */
if (merge) {
skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
} else {
skb_fill_page_desc(skb, i, pfrag->page,
pfrag->offset, copy);
page_ref_inc(pfrag->page);
}
pfrag->offset += copy;
} else {
//zero-copy path
//...
}
WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
TCP_SKB_CB(skb)->end_seq += copy;
tcp_skb_pcount_set(skb, 0);
//accumulate the bytes copied
copied += copy;
if (!msg_data_left(msg)) {
if (unlikely(flags & MSG_EOR))
TCP_SKB_CB(skb)->eor = 1;
goto out;
}
if (skb->len < size_goal || (flags & MSG_OOB) || unlikely(tp->repair))
continue;
//both branches below end up in tcp_write_xmit to send packets;
//too much data has piled up: push it out via __tcp_push_pending_frames
if (forced_push(tp)) {
tcp_mark_push(tp, skb);
__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
} else if (skb == tcp_send_head(sk))
//first packet in flight: send right away via tcp_push_one
tcp_push_one(sk, mss_now);
continue;
}
//...
}
msg holds the data the user wants to write, and it has to be copied into the kernel protocol stack before it can be sent. Inside the stack all packet data lives in struct sk_buff, so the first job is to find free memory and copy the user's data into an sk_buff's domain; the second job is to send that sk_buff.
The layout of sk_buff:
struct sk_buff {
union {
struct {
/* These two members must be first. */
//list pointers chaining sk_buffs together
struct sk_buff *next;
struct sk_buff *prev;
union {
struct net_device *dev;
/* Some protocols might use this space to store information,
* while device pointer would be NULL.
* UDP receive path is one user.
*/
unsigned long dev_scratch;
};
};
struct rb_node rbnode; /* used in netem, ip4 defrag, and tcp stack */
struct list_head list;
};
//...
/* fields enclosed in headers_start/headers_end are copied
* using a single memcpy() in __copy_skb_header()
*/
/* private: */
//everything between headers_start and headers_end holds the per-layer header positions: the L2 mac_header, the L3 network_header, and the L4 transport_header
__u32 headers_start[0];
/* public: */
//...
__u32 priority;
int skb_iif;
__u32 hash;
__be16 vlan_proto;
__u16 vlan_tci;
//...
union {
__u32 mark;
__u32 reserved_tailroom;
};
union {
__be16 inner_protocol;
__u8 inner_ipproto;
};
__u16 inner_transport_header;
__u16 inner_network_header;
__u16 inner_mac_header;
__be16 protocol;
__u16 transport_header;
__u16 network_header;
__u16 mac_header;
/* private: */
__u32 headers_end[0];
/* public: */
/* These elements must be at the end, see alloc_skb() for details. */
sk_buff_data_t tail;
sk_buff_data_t end;
unsigned char *head, *data;
unsigned int truesize;
refcount_t users;
//...
};
tail points to the end of the data in use, while end points to the end of the allocated buffer.
head points to the start of the allocated buffer.
data is a movable pointer whose position depends on the layer currently handling the packet: on receive, as the frame climbs the stack from the driver, each protocol strips its header by advancing skb->data; on transmit, each protocol builds the sk_buff and every lower layer prepends its header by moving skb->data backwards.
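For the linear data area, the four pointers can be pictured as follows (a schematic, not to scale):
head                 data                  tail                   end
 |----- headroom -----|----- in-use data ---|------ tailroom ------|
skb_push() moves data left (prepend a header on transmit)
skb_pull() moves data right (strip a header on receive)
skb_put() moves tail right (append payload)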
Header fields in the struct such as transport_header locate the actual headers (stored as offsets from head):
static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
{
//...
skb_push(skb, tcp_header_size);
skb_reset_transport_header(skb);
//...
}
/**
* skb_push - add data to the start of a buffer
* @skb: buffer to use
* @len: amount of data to add
*
* This function extends the used data area of the buffer at the buffer
* start. If this would exceed the total buffer headroom the kernel will
* panic. A pointer to the first byte of the extra data is returned.
*/
void *skb_push(struct sk_buff *skb, unsigned int len)
{
//transport, network, and link layer headers wrap the packet one inside the next, so a header is prepended by decrementing data by len
skb->data -= len;
skb->len += len;
if (unlikely(skb->data < skb->head))
skb_under_panic(skb, len, __builtin_return_address(0));
return skb->data;
}
static inline void skb_reset_transport_header(struct sk_buff *skb)
{
skb->transport_header = skb->data - skb->head;
}
The data an sk_buff refers to falls into two parts. The first is the contiguous (linear) data area, reached through head. Immediately after it comes the second part, a struct skb_shared_info.
skb_shared_info is an optimization of the transmit path. Directly above the transport layer sits the application, and by TCP's definition the application never sees the individual IP packets underneath: it just writes into a stream. One large write may exceed what a single IP packet can carry, in which case the MSS logic above splits the data into segments carried in separate IP packets; a series of small writes leaves the data scattered, and the IP layer would normally have to merge it into one packet with memory copies. To avoid that cost, some network devices support scatter/gather I/O: the IP layer no longer coalesces data by copying; the scattered pieces stay where they are and the device gathers them when transmitting. In that mode the packet data is not kept in the contiguous area but in the discrete pages referenced from struct skb_shared_info, whose member skb_frag_t frags[MAX_SKB_FRAGS] points to an array of pages with no contiguity guarantee.
skb_shared_info is set up as follows:
/**
* __alloc_skb - allocate a network buffer
* @size: size to allocate
* @gfp_mask: allocation mask
* @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
* instead of head cache and allocate a cloned (child) skb.
* If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
* allocations in case the data is required for writeback
* @node: numa node to allocate memory on
*
* Allocate a new &sk_buff. The returned buffer has no headroom and a
* tail room of at least size bytes. The object has a reference count
* of one. The return is the buffer. On a failure the return is %NULL.
*
* Buffers may only be allocated from interrupts using a @gfp_mask of
* %GFP_ATOMIC.
*/
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
int flags, int node)
{
struct kmem_cache *cache;
struct skb_shared_info *shinfo;
struct sk_buff *skb;
u8 *data;
bool pfmemalloc;
//...
/* make sure we initialize shinfo sequentially */
shinfo = skb_shinfo(skb);
}
#define skb_shinfo(SKB) ((struct skb_shared_info *)(skb_end_pointer(SKB)))
static inline unsigned char *skb_end_pointer(const struct sk_buff *skb)
{
return skb->head + skb->end;
}
The code that checks how much room is left in the linear area:
/**
* skb_availroom - bytes at buffer end
* @skb: buffer to check
*
* Return the number of bytes of free space at the tail of an sk_buff
* allocated by sk_stream_alloc()
*/
static inline int skb_availroom(const struct sk_buff *skb)
{
if (skb_is_nonlinear(skb))
return 0;
return skb->end - skb->tail - skb->reserved_tailroom;
}
And the code around MAX_SKB_FRAGS, the limit on the number of discontiguous fragments:
//lets a frame of up to 64K be packed into a single skb without resorting to frag_list
/* To allow 64K frame to be packed as single skb without frag_list we
* require 64K/PAGE_SIZE pages plus 1 additional page to allow for
* buffers which do not start on a page boundary.
*
* Since GRO uses frags we allocate at least 16 regardless of page
* size.
*/
//which branch applies? with 4K pages, 65536/4096 + 1 = 17, and 17 < 16 is false, so the #else branch gives MAX_SKB_FRAGS = 17
#if (65536/PAGE_SIZE + 1) < 16
#define MAX_SKB_FRAGS 16UL
#else
#define MAX_SKB_FRAGS (65536/PAGE_SIZE + 1)
#endif
/* PAGE_SHIFT determines the page size */
#define PAGE_SHIFT CONFIG_ARM64_PAGE_SHIFT
#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
#define CONFIG_ARM64_PAGE_SHIFT 12
As for how much payload one skb holds: an skb may carry one or more MSS (maximum segment size) worth of data. With GSO/TSO (segmentation offload) enabled in particular, one skb can stand for several TCP segments, although an skb is normally processed and sent as a single TCP segment.
TSO stands for TCP Segmentation Offload. A packet that is too large must be segmented. The protocol stack can do that in the kernel, at some CPU cost, or the work can be deferred to the NIC, which must then support segmenting large packets itself; doing so lowers CPU load. The flow below also involves segmentation and TCP transmission algorithms such as Nagle and the sliding window; those details are skipped for now.
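As an aside, whether a NIC currently has TSO enabled can be queried from user space through the legacy ethtool ioctl; a sketch (the interface name eth0 is an assumption, and `ethtool -k eth0` reports the same thing):
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>
#include <unistd.h>
int main(void)
{
    struct ethtool_value eval = { .cmd = ETHTOOL_GTSO }; //"get TSO state" command
    struct ifreq ifr;
    int fd = socket(AF_INET, SOCK_DGRAM, 0); //any socket serves as the ioctl handle
    memset(&ifr, 0, sizeof(ifr));
    strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1); //assumed interface name
    ifr.ifr_data = (char *)&eval;
    if (ioctl(fd, SIOCETHTOOL, &ifr) == 0)
        printf("tcp-segmentation-offload: %s\n", eval.data ? "on" : "off");
    close(fd);
    return 0;
}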
/* This routine writes packets to the network. It advances the
* send_head. This happens as incoming acks open up the remote
* window for us.
*
* LARGESEND note: !tcp_urg_mode is overkill, only frames between
* snd_up-64k-mss .. snd_up cannot be large. However, taking into
* account rare use of URG, this is not a big flaw.
*
* Send at most one packet when push_one > 0. Temporarily ignore
* cwnd limit to force at most one packet out when push_one == 2.
* Returns true, if no segments are in flight and we have queued segments,
* but cannot send anything now because of SWS or another problem.
*/
static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
int push_one, gfp_t gfp)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
unsigned int tso_segs, sent_pkts;
int cwnd_quota;
int result;
bool is_cwnd_limited = false, is_rwnd_limited = false;
u32 max_segs;
//...
max_segs = tcp_tso_segs(sk, mss_now);
while ((skb = tcp_send_head(sk))) {
unsigned int limit;
//...
//if skb->len <= mss_now no segmentation is needed; otherwise divide skb->len by mss_now to get the segment count and size
tso_segs = tcp_init_tso_segs(skb, mss_now);
//...
//cwnd: the congestion window
cwnd_quota = tcp_cwnd_test(tp, skb);
//...
if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) {
is_rwnd_limited = true;
break;
}
//...
limit = mss_now;
//work out the limit
if (tso_segs > 1 && !tcp_urg_mode(tp))
limit = tcp_mss_split_point(sk, skb, mss_now,
min_t(unsigned int,
cwnd_quota,
max_segs),
nonagle);
//decide whether to segment here in the stack or leave it to the NIC
if (skb->len > limit &&
unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
break;
//...
//transmit the packet
if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
break;
repair:
/* Advance the send_head. This one is sent out.
* This call will increment packets_out.
*/
tcp_event_new_data_sent(sk, skb);
tcp_minshall_update(tp, mss_now, skb);
sent_pkts += tcp_skb_pcount(skb);
if (push_one)
break;
}
//...
}
This is where the TCP header gets filled in, after which queue_xmit hands the packet to the network layer.
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
gfp_t gfp_mask)
{
return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask,
tcp_sk(sk)->rcv_nxt);
}
/* This routine actually transmits TCP packets queued in by
* tcp_do_sendmsg(). This is used by both the initial
* transmission and possible later retransmissions.
* All SKB's seen here are completely headerless. It is our
* job to build the TCP header, and pass the packet down to
* IP so it can do the same plus pass the packet off to the
* device.
*
* We are working here with either a clone of the original
* SKB, or a fresh unique copy made by the retransmit engine.
*/
static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct inet_sock *inet;
struct tcp_sock *tp;
struct tcp_skb_cb *tcb;
struct tcp_out_options opts;
unsigned int tcp_options_size, tcp_header_size;
struct sk_buff *oskb = NULL;
struct tcp_md5sig_key *md5;
struct tcphdr *th;
u64 prior_wstamp;
int err;
BUG_ON(!skb || !tcp_skb_pcount(skb));
tp = tcp_sk(sk);
//...
inet = inet_sk(sk);
tcb = TCP_SKB_CB(skb);
memset(&opts, 0, sizeof(opts));
//...
//compute the TCP header length
tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
//...
skb_push(skb, tcp_header_size);
skb_reset_transport_header(skb);
skb_orphan(skb);
skb->sk = sk; //attach the owning sock
//...
//fill in the TCP header
/* Build TCP header and checksum it. */
th = (struct tcphdr *)skb->data;
th->source = inet->inet_sport; //source port
th->dest = inet->inet_dport; //destination port
th->seq = htonl(tcb->seq); //sequence number
th->ack_seq = htonl(rcv_nxt); //acknowledgment number
*(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
tcb->tcp_flags);
th->check = 0;
th->urg_ptr = 0;
//...
tcp_options_write((__be32 *)(th + 1), tp, &opts);
skb_shinfo(skb)->gso_type = sk->sk_gso_type;
//...
/* BPF prog is the last one writing header option */
bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, 0, &opts);
//...
err = INDIRECT_CALL_INET(icsk->icsk_af_ops->queue_xmit,
inet6_csk_xmit, ip_queue_xmit,
sk, skb, &inet->cork.fl);
//...
}
/* NOTE: A lot of things set to zero explicitly by call to
* sk_alloc() so need not be done here.
*/
//set up when the socket is created
static int tcp_v4_init_sock(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
tcp_init_sock(sk);
icsk->icsk_af_ops = &ipv4_specific;
return 0;
}
const struct inet_connection_sock_af_ops ipv4_specific = {
.queue_xmit = ip_queue_xmit,
.send_check = tcp_v4_send_check,
.rebuild_header = inet_sk_rebuild_header,
.sk_rx_dst_set = inet_sk_rx_dst_set,
.conn_request = tcp_v4_conn_request,
.syn_recv_sock = tcp_v4_syn_recv_sock,
.net_header_len = sizeof(struct iphdr),
.setsockopt = ip_setsockopt,
.getsockopt = ip_getsockopt,
.addr2sockaddr = inet_csk_addr2sockaddr,
.sockaddr_len = sizeof(struct sockaddr_in),
.mtu_reduced = tcp_v4_mtu_reduced,
};
So ip_queue_xmit is what finally pushes the packet into the network layer:
int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
{
return __ip_queue_xmit(sk, skb, fl, inet_sk(sk)->tos);
}
/* Note: skb->sk can be different from sk, in case of tunnels */
int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
__u8 tos)
{
struct inet_sock *inet = inet_sk(sk);
struct net *net = sock_net(sk);
struct ip_options_rcu *inet_opt;
struct flowi4 *fl4;
struct rtable *rt;
struct iphdr *iph;
int res;
/* Skip all of this if the packet is already routed,
* f.e. by something like SCTP.
*/
rcu_read_lock();
inet_opt = rcu_dereference(inet->inet_opt);
fl4 = &fl->u.ip4;
//a route is already cached on the skb: send directly
rt = skb_rtable(skb);
if (rt)
goto packet_routed;
/* Make sure we can route this packet. */
rt = (struct rtable *)__sk_dst_check(sk, 0);
if (!rt) {
__be32 daddr;
//...
/* If this fails, retransmit mechanism of transport layer will
* keep trying until route appears or the connection times
* itself out.
*/
//route lookup: this determines the egress device
rt = ip_route_output_ports(net, fl4, sk,
daddr, inet->inet_saddr,
inet->inet_dport,
inet->inet_sport,
sk->sk_protocol,
RT_CONN_FLAGS_TOS(sk, tos),
sk->sk_bound_dev_if);
if (IS_ERR(rt))
goto no_route;
sk_setup_caps(sk, &rt->dst);
}
skb_dst_set_noref(skb, &rt->dst);
packet_routed:
if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway)
goto no_route;
/* OK, we know where to send it, allocate and build IP header. */
//build the IP header
skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
skb_reset_network_header(skb);
iph = ip_hdr(skb);
*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (tos & 0xff));
//decide whether fragmentation is allowed (the DF bit)
if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df)
iph->frag_off = htons(IP_DF);
else
iph->frag_off = 0;
//TTL
iph->ttl = ip_select_ttl(inet, &rt->dst);
//the protocol carried above IP
iph->protocol = sk->sk_protocol;
//source and destination addresses
ip_copy_addrs(iph, fl4);
/* Transport layer set skb->h.foo itself. */
if (inet_opt && inet_opt->opt.optlen) {
iph->ihl += inet_opt->opt.optlen >> 2;
ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
}
ip_select_ident_segs(net, skb, sk,
skb_shinfo(skb)->gso_segs ?: 1);
/* TODO : should we use skb->sk here instead of sk ? */
skb->priority = sk->sk_priority;
skb->mark = sk->sk_mark;
res = ip_local_out(net, sk, skb);
rcu_read_unlock();
return res;
no_route:
rcu_read_unlock();
IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
kfree_skb(skb);
return -EHOSTUNREACH;
}
The route lookup goes ip_route_output_ports -> ip_route_output_flow -> __ip_route_output_key -> ip_route_output_key_hash -> ip_route_output_key_hash_rcu:
struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
struct fib_result *res,
const struct sk_buff *skb)
{
struct net_device *dev_out = NULL;
int orig_oif = fl4->flowi4_oif;
unsigned int flags = 0;
struct rtable *rth;
int err;
//...
err = fib_lookup(net, fl4, res, 0);
//...
fib_select_path(net, res, fl4, skb);
//egress device
dev_out = FIB_RES_DEV(*res);
make_route:
//build the routing entry
rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
out:
return rth;
}
fib_lookup performs the longest-prefix match; the kernel keeps IPv4 routes in an LPM trie:
static inline int fib_lookup(struct net *net, struct flowi4 *flp,
struct fib_result *res, unsigned int flags)
{
struct fib_table *tb;
int err = -ENETUNREACH;
flags |= FIB_LOOKUP_NOREF;
if (net->ipv4.fib_has_custom_rules)
return __fib_lookup(net, flp, res, flags);
rcu_read_lock();
res->tclassid = 0;
//consult the main table first
tb = rcu_dereference_rtnl(net->ipv4.fib_main);
if (tb)
err = fib_table_lookup(tb, flp, res, flags);
if (!err)
goto out;
//then the default table
tb = rcu_dereference_rtnl(net->ipv4.fib_default);
if (tb)
err = fib_table_lookup(tb, flp, res, flags);
out:
if (err == -EAGAIN)
err = -ENETUNREACH;
rcu_read_unlock();
return err;
}
/* called with rcu_read_lock() */
static struct rtable *__mkroute_output(const struct fib_result *res,
const struct flowi4 *fl4, int orig_oif,
struct net_device *dev_out,
unsigned int flags)
{
struct fib_info *fi = res->fi;
struct fib_nh_exception *fnhe;
struct in_device *in_dev;
u16 type = res->type;
struct rtable *rth;
bool do_cache;
//...
add:
rth = rt_dst_alloc(dev_out, flags, type,
IN_DEV_CONF_GET(in_dev, NOPOLICY),
IN_DEV_CONF_GET(in_dev, NOXFRM));
//...
rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
//...
}
struct rtable *rt_dst_alloc(struct net_device *dev,
unsigned int flags, u16 type,
bool nopolicy, bool noxfrm)
{
struct rtable *rt;
//dev is attached here
rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
(nopolicy ? DST_NOPOLICY : 0) |
(noxfrm ? DST_NOXFRM : 0));
if (rt) {
rt->rt_genid = rt_genid_ipv4(dev_net(dev));
rt->rt_flags = flags;
rt->rt_type = type;
rt->rt_is_input = 0;
rt->rt_iif = 0;
rt->rt_pmtu = 0;
rt->rt_mtu_locked = 0;
rt->rt_uses_gateway = 0;
rt->rt_gw_family = 0;
rt->rt_gw4 = 0;
INIT_LIST_HEAD(&rt->rt_uncached);
//the output callback is installed here
rt->dst.output = ip_output;
if (flags & RTCF_LOCAL)
rt->dst.input = ip_local_deliver;
}
return rt;
}
ip_local_out is then called to emit the IP packet:
int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
int err;
err = __ip_local_out(net, sk, skb);
if (likely(err == 1))
err = dst_output(net, sk, skb);
return err;
}
int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct iphdr *iph = ip_hdr(skb);
iph->tot_len = htons(skb->len);
ip_send_check(iph);
/* if egress device is enslaved to an L3 master device pass the
* skb to its handler for processing
*/
skb = l3mdev_ip_out(sk, skb);
if (unlikely(!skb))
return 0;
skb->protocol = htons(ETH_P_IP);
return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
net, sk, skb, NULL, skb_dst(skb)->dev,
dst_output);
}
This calls dst_output:
/* Output packet to network from transport. */
static inline int dst_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
return skb_dst(skb)->output(net, sk, skb);
}
/**
* skb_dst - returns skb dst_entry
* @skb: buffer
*
* Returns skb dst_entry, regardless of reference taken or not.
*/
static inline struct dst_entry *skb_dst(const struct sk_buff *skb)
{
/* If refdst was not refcounted, check we still are in a
* rcu_read_lock section
*/
WARN_ON((skb->_skb_refdst & SKB_DST_NOREF) &&
!rcu_read_lock_held() &&
!rcu_read_lock_bh_held());
return (struct dst_entry *)(skb->_skb_refdst & SKB_DST_PTRMASK);
}
What runs here is the output function of struct rtable's embedded dst; from rt_dst_alloc above, output points to ip_output.
int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
//record the egress net_device in the skb
skb->dev = dev;
skb->protocol = htons(ETH_P_IP);
return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
net, sk, skb, indev, dev,
ip_finish_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
int ret;
ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
switch (ret) {
case NET_XMIT_SUCCESS:
return __ip_finish_output(net, sk, skb);
case NET_XMIT_CN:
return __ip_finish_output(net, sk, skb) ? : ret;
default:
kfree_skb(skb);
return ret;
}
}
static int __ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
unsigned int mtu;
//...
mtu = ip_skb_dst_mtu(sk, skb);
if (skb_is_gso(skb))
return ip_finish_output_gso(net, sk, skb, mtu);
//IP fragmentation
if (skb->len > mtu || IPCB(skb)->frag_max_size)
return ip_fragment(net, sk, skb, mtu, ip_finish_output2);
return ip_finish_output2(net, sk, skb);
}
static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
//the routing entry is recovered from the skb here
struct rtable *rt = (struct rtable *)dst;
struct net_device *dev = dst->dev;
unsigned int hh_len = LL_RESERVED_SPACE(dev);
struct neighbour *neigh;
bool is_v6gw = false;
//...
rcu_read_lock_bh();
//look up the neighbour table
neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
if (!IS_ERR(neigh)) {
int res;
sock_confirm_neigh(skb, neigh);
/* if crossing protocols, can not use the cached header */
//transmit via the neighbour
res = neigh_output(neigh, skb, is_v6gw);
//...
}
//...
}
static inline struct neighbour *ip_neigh_for_gw(struct rtable *rt,
struct sk_buff *skb,
bool *is_v6gw)
{
struct net_device *dev = rt->dst.dev;
struct neighbour *neigh;
if (likely(rt->rt_gw_family == AF_INET)) {
neigh = ip_neigh_gw4(dev, rt->rt_gw4);
//...
}
return neigh;
}
static inline struct neighbour *ip_neigh_gw4(struct net_device *dev,
__be32 daddr)
{
struct neighbour *neigh;
neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)daddr);
if (unlikely(!neigh))
neigh = __neigh_create(&arp_tbl, &daddr, dev, false);
return neigh;
}
static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key)
{
if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
key = INADDR_ANY;
return ___neigh_lookup_noref(&arp_tbl, neigh_key_eq32, arp_hashfn, &key, dev);
}
static inline struct neighbour *___neigh_lookup_noref(
struct neigh_table *tbl,
bool (*key_eq)(const struct neighbour *n, const void *pkey),
__u32 (*hash)(const void *pkey,
const struct net_device *dev,
__u32 *hash_rnd),
const void *pkey,
struct net_device *dev)
{
struct neigh_hash_table *nht = rcu_dereference_bh(tbl->nht);
struct neighbour *n;
u32 hash_val;
//pkey is the destination address
hash_val = hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift);
for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]);
n != NULL;
n = rcu_dereference_bh(n->next)) {
if (n->dev == dev && key_eq(n, pkey))
return n;
}
return NULL;
}
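The entries this hash walk returns are the same ones exposed through /proc/net/arp; a trivial way to dump them (equivalent to `cat /proc/net/arp`):
#include <stdio.h>
int main(void)
{
    char line[256];
    FILE *f = fopen("/proc/net/arp", "r"); //the kernel's view of the ARP/neighbour cache
    if (!f)
        return 1;
    while (fgets(line, sizeof(line), f))
        fputs(line, stdout); //IP address, HW type, flags, MAC address, device
    fclose(f);
    return 0;
}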
The definition of arp_tbl:
struct neigh_table arp_tbl = {
.family = AF_INET,
.key_len = 4,
.protocol = cpu_to_be16(ETH_P_IP),
.hash = arp_hash,
.key_eq = arp_key_eq,
.constructor = arp_constructor,
.proxy_redo = parp_redo,
.is_multicast = arp_is_multicast,
.id = "arp_cache",
//...
.gc_interval = 30 * HZ,
.gc_thresh1 = 128,
.gc_thresh2 = 512,
.gc_thresh3 = 1024,
};
If the lookup misses, an entry is created:
struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
struct net_device *dev, bool want_ref)
{
return ___neigh_create(tbl, pkey, dev, 0, false, want_ref);
}
static struct neighbour *
___neigh_create(struct neigh_table *tbl, const void *pkey,
struct net_device *dev, u8 flags,
bool exempt_from_gc, bool want_ref)
{
u32 hash_val, key_len = tbl->key_len;
struct neighbour *n1, *rc, *n;
struct neigh_hash_table *nht;
int error;
//allocate a struct neighbour, which tracks the peer MAC address and ARP state
n = neigh_alloc(tbl, dev, flags, exempt_from_gc);
trace_neigh_create(tbl, dev, pkey, n, exempt_from_gc);
if (!n) {
rc = ERR_PTR(-ENOBUFS);
goto out;
}
memcpy(n->primary_key, pkey, key_len);
n->dev = dev;
dev_hold(dev);
/* Protocol specific setup. */
//arp_tbl's constructor
if (tbl->constructor && (error = tbl->constructor(n)) < 0) {
rc = ERR_PTR(error);
goto out_neigh_release;
}
//...
if (atomic_read(&tbl->entries) > (1 << nht->hash_shift))
nht = neigh_hash_grow(tbl, nht->hash_shift + 1);
hash_val = tbl->hash(n->primary_key, dev, nht->hash_rnd) >> (32 - nht->hash_shift);
if (n->parms->dead) {
rc = ERR_PTR(-EINVAL);
goto out_tbl_unlock;
}
for (n1 = rcu_dereference_protected(nht->hash_buckets[hash_val],
lockdep_is_held(&tbl->lock));
n1 != NULL;
n1 = rcu_dereference_protected(n1->next,
lockdep_is_held(&tbl->lock))) {
if (dev == n1->dev && !memcmp(n1->primary_key, n->primary_key, key_len)) {
if (want_ref)
neigh_hold(n1);
rc = n1;
goto out_tbl_unlock;
}
}
//...
//link the new neighbour entry into the hash table that the lookup above searches
rcu_assign_pointer(n->next,
rcu_dereference_protected(nht->hash_buckets[hash_val],
lockdep_is_held(&tbl->lock)));
rcu_assign_pointer(nht->hash_buckets[hash_val], n);
//...
}
static struct neighbour *neigh_alloc(struct neigh_table *tbl,
struct net_device *dev,
u8 flags, bool exempt_from_gc)
{
struct neighbour *n = NULL;
unsigned long now = jiffies;
int entries;
//...
n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC);
if (!n)
goto out_entries;
//skbs waiting for ARP to resolve a MAC address are queued on arp_queue
__skb_queue_head_init(&n->arp_queue);
rwlock_init(&n->lock);
seqlock_init(&n->ha_lock);
n->updated = n->used = now;
n->nud_state = NUD_NONE;
n->output = neigh_blackhole;
n->flags = flags;
seqlock_init(&n->hh.hh_lock);
n->parms = neigh_parms_clone(&tbl->parms);
//the timer later fires neigh_timer_handler to drive the pending ARP work
timer_setup(&n->timer, neigh_timer_handler, 0);
NEIGH_CACHE_STAT_INC(tbl, allocs);
n->tbl = tbl;
refcount_set(&n->refcnt, 1);
n->dead = 1;
//...
}
static int arp_constructor(struct neighbour *neigh)
{
__be32 addr;
struct net_device *dev = neigh->dev;
struct in_device *in_dev;
struct neigh_parms *parms;
u32 inaddr_any = INADDR_ANY;
//...
neigh->type = inet_addr_type_dev_table(dev_net(dev), dev, addr);
parms = in_dev->arp_parms;
__neigh_parms_put(neigh->parms);
neigh->parms = neigh_parms_clone(parms);
//...
neigh->ops = &arp_hh_ops;
//...
neigh->output = neigh->ops->output;
//...
}
static const struct neigh_ops arp_hh_ops = {
.family = AF_INET,
.solicit = arp_solicit,
.error_report = arp_error_report,
.output = neigh_resolve_output,
.connected_output = neigh_resolve_output,
};
static inline int neigh_output(struct neighbour *n, struct sk_buff *skb,
bool skip_cache)
{
const struct hh_cache *hh = &n->hh;
/* n->nud_state and hh->hh_len could be changed under us.
* neigh_hh_output() is taking care of the race later.
*/
if (!skip_cache &&
(READ_ONCE(n->nud_state) & NUD_CONNECTED) &&
READ_ONCE(hh->hh_len))
return neigh_hh_output(hh, skb);
return n->output(n, skb);
}
Given the neighbour-creation flow above, the n->output called here is neigh_resolve_output:
/* Slow and careful. */
int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb)
{
int rc = 0;
if (!neigh_event_send(neigh, skb)) {
//...
rc = dev_queue_xmit(skb);
//...
}
//...
}
static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
{
//...
if (!(neigh->nud_state&(NUD_CONNECTED|NUD_DELAY|NUD_PROBE)))
return __neigh_event_send(neigh, skb);
return 0;
}
int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
{
int rc;
bool immediate_probe = false;
write_lock_bh(&neigh->lock);
rc = 0;
if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))
goto out_unlock_bh;
if (neigh->dead)
goto out_dead;
if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) {
if (NEIGH_VAR(neigh->parms, MCAST_PROBES) +
NEIGH_VAR(neigh->parms, APP_PROBES)) {
unsigned long next, now = jiffies;
atomic_set(&neigh->probes,
NEIGH_VAR(neigh->parms, UCAST_PROBES));
neigh_del_timer(neigh);
neigh->nud_state = NUD_INCOMPLETE;
neigh->updated = now;
next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME),
HZ/100);
neigh_add_timer(neigh, next);
//kick off the ARP probe immediately
immediate_probe = true;
} else {
//...
}
} else if (neigh->nud_state & NUD_STALE) {
neigh_dbg(2, "neigh %p is delayed\n", neigh);
neigh_del_timer(neigh);
neigh->nud_state = NUD_DELAY;
neigh->updated = jiffies;
neigh_add_timer(neigh, jiffies +
NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME));
}
if (neigh->nud_state == NUD_INCOMPLETE) {
if (skb) {
//the queue is over its byte budget
while (neigh->arp_queue_len_bytes + skb->truesize >
NEIGH_VAR(neigh->parms, QUEUE_LEN_BYTES)) {
struct sk_buff *buff;
//drop one queued skb
buff = __skb_dequeue(&neigh->arp_queue);
if (!buff)
break;
neigh->arp_queue_len_bytes -= buff->truesize;
kfree_skb(buff);
NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards);
}
skb_dst_force(skb);
//append the skb to the arp queue
__skb_queue_tail(&neigh->arp_queue, skb);
neigh->arp_queue_len_bytes += skb->truesize;
}
rc = 1;
}
out_unlock_bh:
if (immediate_probe)
neigh_probe(neigh);
//...
}
If it is immediate, neigh_probe is called on the spot; if deferred, the timer fires neigh_timer_handler later, which also ends up in neigh_probe.
static void neigh_probe(struct neighbour *neigh)
__releases(neigh->lock)
{
struct sk_buff *skb = skb_peek_tail(&neigh->arp_queue);
/* keep skb alive even if arp_queue overflows */
if (skb)
skb = skb_clone(skb, GFP_ATOMIC);
write_unlock(&neigh->lock);
if (neigh->ops->solicit)
neigh->ops->solicit(neigh, skb);
atomic_inc(&neigh->probes);
consume_skb(skb);
}
The solicit here is again arp_hh_ops's, namely arp_solicit:
static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
{
//...
arp_send_dst(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
dst_hw, dev->dev_addr, NULL, dst);
}
/* Create and send an arp packet. */
static void arp_send_dst(int type, int ptype, __be32 dest_ip,
struct net_device *dev, __be32 src_ip,
const unsigned char *dest_hw,
const unsigned char *src_hw,
const unsigned char *target_hw,
struct dst_entry *dst)
{
struct sk_buff *skb;
/* arp on this interface. */
if (dev->flags & IFF_NOARP)
return;
skb = arp_create(type, ptype, dest_ip, dev, src_ip,
dest_hw, src_hw, target_hw);
if (!skb)
return;
skb_dst_set(skb, dst_clone(dst));
arp_xmit(skb);
}
arp_send_dst builds and transmits an ARP packet.
Once ARP has done its job, dev_queue_xmit is called and the packet truly enters the network device subsystem:
int dev_queue_xmit(struct sk_buff *skb)
{
return __dev_queue_xmit(skb, NULL);
}
/**
* __dev_queue_xmit - transmit a buffer
* @skb: buffer to transmit
* @sb_dev: subordinate device used for L2 forwarding offload
*
* Queue a buffer for transmission to a network device. The caller must
* have set the device and priority and built the buffer before calling
* this function. The function can be called from an interrupt.
*
* A negative errno code is returned on a failure. A success does not
* guarantee the frame will be transmitted as it may be dropped due
* to congestion or traffic shaping.
*
* -----------------------------------------------------------------------------------
* I notice this method can also return errors from the queue disciplines,
* including NET_XMIT_DROP, which is a positive value. So, errors can also
* be positive.
*
* Regardless of the return value, the skb is consumed, so it is currently
* difficult to retry a send to this method. (You can bump the ref count
* before sending to hold a reference for retry if you are careful.)
*
* When calling this method, interrupts MUST be enabled. This is because
* the BH enable code must have IRQs enabled so that it will not deadlock.
* --BLG
*/
static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
{
struct net_device *dev = skb->dev;
struct netdev_queue *txq;
struct Qdisc *q;
int rc = -ENOMEM;
bool again = false;
skb_reset_mac_header(skb);
//...
txq = netdev_core_pick_tx(dev, skb, sb_dev);
q = rcu_dereference_bh(txq->qdisc);
trace_net_dev_queue(skb);
if (q->enqueue) {
rc = __dev_xmit_skb(skb, q, dev, txq);
goto out;
}
//...
}
qdisc is short for queueing discipline. Whenever the kernel wants to send packets out a network interface, it must enqueue them according to the qdisc configured for that interface.
The simplest qdisc is pfifo, which does no processing at all: packets pass through a single first-in, first-out queue. pfifo_fast is slightly more elaborate: its queue consists of three bands, and within each band FIFO order applies.
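A toy model of pfifo_fast's dequeue order (a sketch; in the real qdisc, skb->priority selects one of the three bands through a prio2band table, and lower-numbered bands always drain first):
#include <stdio.h>
#define BANDS 3
#define QLEN 16
static int q[BANDS][QLEN], head[BANDS], tail[BANDS]; //one FIFO per band; band 0 is most urgent
static void enqueue(int band, int pkt) { q[band][tail[band]++] = pkt; }
static int dequeue(void)
{
    for (int b = 0; b < BANDS; b++) //the lowest non-empty band always wins
        if (head[b] != tail[b])
            return q[b][head[b]++];
    return -1; //all bands empty
}
int main(void)
{
    enqueue(2, 100); //bulk traffic
    enqueue(0, 200); //interactive traffic
    enqueue(1, 300); //best effort
    for (int p; (p = dequeue()) != -1; )
        printf("%d ", p); //prints: 200 300 100
    printf("\n");
    return 0;
}
Band 0 drains before band 1, which drains before band 2, so latency-sensitive traffic mapped to band 0 bypasses queued bulk traffic.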
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
struct net_device *dev,
struct netdev_queue *txq)
{
//...
rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
if (qdisc_run_begin(q)) {
if (unlikely(contended)) {
spin_unlock(&q->busylock);
contended = false;
}
__qdisc_run(q);
qdisc_run_end(q);
}
//...
}
void __qdisc_run(struct Qdisc *q)
{
int quota = dev_tx_weight;
int packets;
while (qdisc_restart(q, &packets)) {
quota -= packets;
if (quota <= 0) {
//throttle transmission: when the quota is exhausted, reschedule via __netif_schedule
__netif_schedule(q);
break;
}
}
}
void __netif_schedule(struct Qdisc *q)
{
if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
__netif_reschedule(q);
}
static void __netif_reschedule(struct Qdisc *q)
{
struct softnet_data *sd;
unsigned long flags;
local_irq_save(flags);
sd = this_cpu_ptr(&softnet_data);
q->next_sched = NULL;
*sd->output_queue_tailp = q;
sd->output_queue_tailp = &q->next_sched;
raise_softirq_irqoff(NET_TX_SOFTIRQ);
local_irq_restore(flags);
}
static int __init net_dev_init(void)
{
//...
struct softnet_data *sd = &per_cpu(softnet_data, i);
//...
sd->output_queue_tailp = &sd->output_queue;
//...
open_softirq(NET_TX_SOFTIRQ, net_tx_action);
open_softirq(NET_RX_SOFTIRQ, net_rx_action);
//...
}
static __latent_entropy void net_tx_action(struct softirq_action *h)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
//...
if (sd->output_queue) {
struct Qdisc *head;
local_irq_disable();
head = sd->output_queue;
sd->output_queue = NULL;
sd->output_queue_tailp = &sd->output_queue;
local_irq_enable();
rcu_read_lock();
while (head) {
struct Qdisc *q = head;
spinlock_t *root_lock = NULL;
head = head->next_sched;
//...
qdisc_run(q);
//...
}
//...
}
//...
}
qdisc_restart does the actual sending. If transmission exceeds the quota, net_tx_action runs later and again calls qdisc_run, which calls __qdisc_run, which calls qdisc_restart to send packets.
/*
* NOTE: Called under qdisc_lock(q) with locally disabled BH.
*
* running seqcount guarantees only one CPU can process
* this qdisc at a time. qdisc_lock(q) serializes queue accesses for
* this queue.
*
* netif_tx_lock serializes accesses to device driver.
*
* qdisc_lock(q) and netif_tx_lock are mutually exclusive,
* if one is grabbed, another must be free.
*
* Note, that this procedure can be called by a watchdog timer
*
* Returns to the caller:
* 0 - queue is empty or throttled.
* >0 - queue is not empty.
*
*/
static inline bool qdisc_restart(struct Qdisc *q, int *packets)
{
spinlock_t *root_lock = NULL;
struct netdev_queue *txq;
struct net_device *dev;
struct sk_buff *skb;
bool validate;
/* Dequeue packet */
skb = dequeue_skb(q, &validate, packets);
if (unlikely(!skb))
return false;
if (!(q->flags & TCQ_F_NOLOCK))
root_lock = qdisc_lock(q);
dev = qdisc_dev(q);
txq = skb_get_tx_queue(dev, skb);
return sch_direct_xmit(skb, q, dev, txq, root_lock, validate);
}
/*
* Transmit possibly several skbs, and handle the return status as
* required. Owning running seqcount bit guarantees that
* only one CPU can execute this function.
*
* Returns to the caller:
* false - hardware queue frozen backoff
* true - feel free to send more pkts
*/
bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
struct net_device *dev, struct netdev_queue *txq,
spinlock_t *root_lock, bool validate)
{
int ret = NETDEV_TX_BUSY;
bool again = false;
//...
if (likely(skb)) {
HARD_TX_LOCK(dev, txq, smp_processor_id());
if (!netif_xmit_frozen_or_stopped(txq))
skb = dev_hard_start_xmit(skb, dev, txq, &ret);
//...
}
if (!dev_xmit_complete(ret)) {
/* Driver returned NETDEV_TX_BUSY - requeue skb */
if (unlikely(ret != NETDEV_TX_BUSY))
net_warn_ratelimited("BUG %s code %d qlen %d\n",
dev->name, ret, q->q.qlen);
//NIC busy: requeue the skb
dev_requeue_skb(skb, q);
return false;
}
return true;
}
struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
struct netdev_queue *txq, int *ret)
{
struct sk_buff *skb = first;
int rc = NETDEV_TX_OK;
while (skb) {
struct sk_buff *next = skb->next;
skb_mark_not_on_list(skb);
rc = xmit_one(skb, dev, txq, next != NULL);
if (unlikely(!dev_xmit_complete(rc))) {
skb->next = next;
goto out;
}
skb = next;
if (netif_tx_queue_stopped(txq) && skb) {
rc = NETDEV_TX_BUSY;
break;
}
}
//...
}
static int xmit_one(struct sk_buff *skb, struct net_device *dev,
struct netdev_queue *txq, bool more)
{
unsigned int len;
int rc;
//deliver a copy to taps (packet capture)
if (dev_nit_active(dev))
dev_queue_xmit_nit(skb, dev);
len = skb->len;
PRANDOM_ADD_NOISE(skb, dev, txq, len + jiffies);
trace_net_dev_start_xmit(skb, dev);
rc = netdev_start_xmit(skb, dev, txq, more);
trace_net_dev_xmit(skb, rc, dev, len);
return rc;
}
static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev,
struct netdev_queue *txq, bool more)
{
const struct net_device_ops *ops = dev->netdev_ops;
netdev_tx_t rc;
rc = __netdev_start_xmit(ops, skb, dev, more);
if (rc == NETDEV_TX_OK)
txq_trans_update(txq);
return rc;
}
static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops,
struct sk_buff *skb, struct net_device *dev,
bool more)
{
__this_cpu_write(softnet_data.xmit.more, more);
return ops->ndo_start_xmit(skb, dev);
}
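ndo_start_xmit is the driver's job. The kernel's dummy device is close to the minimal possible implementation, sketched below (field names follow older kernels; newer versions use per-cpu stats):
static netdev_tx_t dummy_xmit(struct sk_buff *skb, struct net_device *dev)
{
	dev->stats.tx_packets++; //account the "transmitted" frame
	dev->stats.tx_bytes += skb->len;
	dev_kfree_skb(skb); //no hardware behind it: just free the buffer
	return NETDEV_TX_OK; //tell the stack the skb was consumed
}
static const struct net_device_ops dummy_netdev_ops = {
	.ndo_start_xmit = dummy_xmit,
	//...
};
A real driver would instead place the frame on a hardware TX ring here and complete the skb from its TX-done interrupt.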
Packets injected into the kernel stack through a virtual NIC
On a typical L3 switch, the virtual (VLAN) interface is implemented with a tun/tap device: a VLAN contains several member ports, i.e. real L2 ports, and an L2 port may in turn carry several allowed VLANs.
When a physical port receives a frame, the SDK parses out the VLAN ID and delivers the frame to the protocol stack by writing it into the tun/tap device.
On the kernel side, tun_get_user reads the packet out of the user-space buffer.
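The user-space side of that can be sketched as follows (the device name tap0 is an assumption, and CAP_NET_ADMIN is required): open /dev/net/tun, attach with the TUNSETIFF ioctl, and every write() injects one frame, which the kernel picks up in tun_get_user:
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/if.h>
#include <linux/if_tun.h>
int main(void)
{
    struct ifreq ifr;
    int fd = open("/dev/net/tun", O_RDWR);
    if (fd < 0)
        return 1;
    memset(&ifr, 0, sizeof(ifr));
    ifr.ifr_flags = IFF_TAP | IFF_NO_PI; //L2 frames, no extra packet-info header
    strncpy(ifr.ifr_name, "tap0", IFNAMSIZ - 1); //assumed device name
    if (ioctl(fd, TUNSETIFF, &ifr) < 0)
        return 1;
    unsigned char frame[64] = { 0 }; //a (bogus) Ethernet frame
    write(fd, frame, sizeof(frame)); //lands in tun_get_user()
    //read(fd, ...) would conversely pull frames the stack sends to tap0
    close(fd);
    return 0;
}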