Linux: How a Packet Travels from the NIC to the Protocol Stack

Background

        As a Linux kernel developer, I am often asked how the kernel handles incoming packets. I can usually sketch the big picture quickly, but my focus has mostly been on packet processing inside the protocol stack; the path a packet takes from arriving at the NIC to being handed to the stack has always been fuzzy to me. This post is the result of studying that path.

Packet reception process

        Network packet reception proceeds in the following steps:

        1) Hardware reception: the NIC receives a data frame over the physical/data-link layer.

        2) DMA transfer: the NIC's DMA engine copies the frame into the pre-allocated ring buffer and raises a hardware interrupt to notify the CPU that a packet has arrived.

        3) Hardware interrupt handling: the CPU runs the driver's interrupt handler. In the legacy (netif_rx) model the handler copies the packet into a kernel skb and queues it on the CPU's receive queue; with NAPI, which the rest of this post describes, the handler merely acknowledges the interrupt and schedules polling.

        4) The receive softirq NET_RX_SOFTIRQ is raised.

        5) Softirq handling: the driver's poll function drains packets from the ring buffer and delivers them to the kernel protocol stack.

        6) Protocol stack processing.

        Note that the Linux kernel handles only L2-L4: L1 is the hardware's job, and everything above L4 belongs to the application.
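
        Steps 3) through 5) are exactly what the NAPI framework implements. As a preview, here is a minimal sketch of the pattern every NAPI driver follows (hypothetical driver "foo"; struct foo_priv and the foo_* helpers are assumptions, not real kernel APIs):

/* Hard-irq half: acknowledge the device and schedule polling (steps 3-4). */
static irqreturn_t foo_intr(int irq, void *dev_id)
{
	struct foo_priv *priv = dev_id;

	foo_disable_irq(priv);      /* quiesce the device */
	napi_schedule(&priv->napi); /* queues us on this CPU's poll list and
				     * raises NET_RX_SOFTIRQ */
	return IRQ_HANDLED;
}

/* Softirq half: pull packets and deliver them to the stack (step 5). */
static int foo_poll(struct napi_struct *napi, int budget)
{
	struct foo_priv *priv = container_of(napi, struct foo_priv, napi);
	int work_done = foo_rx(priv, budget); /* pass up to 'budget' skbs up */

	if (work_done < budget && napi_complete_done(napi, work_done))
		foo_enable_irq(priv); /* ring drained: interrupts back on */
	return work_done;
}

        The sections below show where e100 implements each half of this pattern.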

Analysis

        Packet reception revolves around the NIC driver. Taking Ubuntu as an example, a common Intel NIC driver is the e100 module (for Intel 8255x 10/100 adapters), which we use as the example here. Before diving into reception itself, we need a picture of driver loading, device probing, interrupt requesting, and the rest of the setup sequence.

        In the Linux kernel a driver takes the form of a module; that is, a driver is really a kernel module that gets loaded and registered during module initialization at kernel startup. The call chain is as follows:

[Figure 1: call chain for loading and registering the driver module]

        The driver registers itself as follows:

static int __init e100_init_module(void)
{
	if (((1 << debug) - 1) & NETIF_MSG_DRV) {
		pr_info("%s, %s\n", DRV_DESCRIPTION, DRV_VERSION);
		pr_info("%s\n", DRV_COPYRIGHT);
	}
	return pci_register_driver(&e100_driver);
}
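
        e100_init_module is hooked up as the module's entry point at the bottom of e100.c with the standard module macros:

module_init(e100_init_module);
module_exit(e100_exit_module);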

        So the driver object is e100_driver, registered with the kernel via pci_register_driver. e100_driver is defined as follows:

static struct pci_driver e100_driver = {
	.name =         DRV_NAME, /* driver name */
	.id_table =     e100_id_table,
	.probe =        e100_probe, /* called when a matching NIC is found */
	.remove =       e100_remove, /* called when the NIC is removed; releases its resources */
#ifdef CONFIG_PM
	/* Power Management hooks */
	.suspend =      e100_suspend,
	.resume =       e100_resume,
#endif
	.shutdown =     e100_shutdown, /* called at system shutdown */
	.err_handler = &e100_err_handler,
};

        What pci_register_driver really does is hang the driver off the corresponding bus. When a NIC appears on the PCI bus, the bus core scans the drivers registered under it and matches the device's identity (vendor ID, device ID, and so on) against each driver's id_table; on a match it invokes that driver's probe function, which requests an interrupt line for the NIC, registers the interrupt handler, and performs the rest of the setup.
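
        As an illustration of what such a matching table looks like (a sketch only: demo_id_table is a made-up name, and the real e100_id_table lists many more device IDs):

/* The PCI core compares each device's config-space vendor/device IDs
 * against these entries; on a match it calls the driver's probe(). */
static const struct pci_device_id demo_id_table[] = {
	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x1229) }, /* an 8255x-family ID */
	{ 0, } /* all-zero terminator */
};
MODULE_DEVICE_TABLE(pci, demo_id_table); /* exported so modprobe can autoload */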

        From the definition of e100_driver we can see the driver's core entry points: e100_probe, e100_remove, and e100_shutdown. Here we focus on e100_probe, the one relevant to reception.

The probe function

        The probe logic is as follows:

static int e100_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
{
	struct net_device *netdev;
	struct nic *nic;
	int err;

	/* Allocate the net_device structure, with room for the driver's private struct nic */
	if (!(netdev = alloc_etherdev(sizeof(struct nic))))
		return -ENOMEM;

	/* Set hardware feature flags */
	netdev->hw_features |= NETIF_F_RXFCS;
	netdev->priv_flags |= IFF_SUPP_NOFCS;
	netdev->hw_features |= NETIF_F_RXALL;

	/* Install the net_device operations: open, close, start_xmit, do_ioctl, etc. */
	netdev->netdev_ops = &e100_netdev_ops;
	SET_ETHTOOL_OPS(netdev, &e100_ethtool_ops); /* install the ethtool configuration interface */
	netdev->watchdog_timeo = E100_WATCHDOG_PERIOD;
	strncpy(netdev->name, pci_name(pdev), sizeof(netdev->name) - 1);

	nic = netdev_priv(netdev);
	/* Register the NAPI poll function the softirq will use to drain received packets from the ring buffer */
	netif_napi_add(netdev, &nic->napi, e100_poll, E100_NAPI_WEIGHT);
	nic->netdev = netdev;
	nic->pdev = pdev;
	nic->msg_enable = (1 << debug) - 1;
	nic->mdio_ctrl = mdio_ctrl_hw;
	pci_set_drvdata(pdev, netdev);

	/* Enable the PCI device */
	if ((err = pci_enable_device(pdev))) {
		netif_err(nic, probe, nic->netdev, "Cannot enable PCI device, aborting\n");
		goto err_out_free_dev;
	}

	if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)) {
		netif_err(nic, probe, nic->netdev, "Cannot find proper PCI device base address, aborting\n");
		err = -ENODEV;
		goto err_out_disable_pdev;
	}

	/* Reserve the device's I/O memory regions */
	if ((err = pci_request_regions(pdev, DRV_NAME))) {
		netif_err(nic, probe, nic->netdev, "Cannot obtain PCI resources, aborting\n");
		goto err_out_disable_pdev;
	}

	/* Set the DMA mask: declares the device can DMA within 32-bit addresses */
	if ((err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)))) {
		netif_err(nic, probe, nic->netdev, "No usable DMA configuration, aborting\n");
		goto err_out_free_res;
	}

	SET_NETDEV_DEV(netdev, &pdev->dev);

	if (use_io)
		netif_info(nic, probe, nic->netdev, "using i/o access mode\n");

	/* Map the device registers (I/O memory) */
	nic->csr = pci_iomap(pdev, (use_io ? 1 : 0), sizeof(struct csr));
	if (!nic->csr) {
		netif_err(nic, probe, nic->netdev, "Cannot map device registers, aborting\n");
		err = -ENOMEM;
		goto err_out_free_res;
	}

	if (ent->driver_data)
		nic->flags |= ich;
	else
		nic->flags &= ~ich;

	e100_get_defaults(nic);

	/* D100 MAC doesn't allow rx of vlan packets with normal MTU */
	if (nic->mac < mac_82558_D101_A4)
		netdev->features |= NETIF_F_VLAN_CHALLENGED;

	/* locks must be initialized before calling hw_reset */
	spin_lock_init(&nic->cb_lock);
	spin_lock_init(&nic->cmd_lock);
	spin_lock_init(&nic->mdio_lock);

	/* Reset the device before pci_set_master() in case device is in some
	 * funky state and has an interrupt pending - hint: we don't have the
	 * interrupt handler registered yet. */
	 /* Hardware reset */
	e100_hw_reset(nic);

	pci_set_master(pdev);

	init_timer(&nic->watchdog);
	nic->watchdog.function = e100_watchdog;
	nic->watchdog.data = (unsigned long)nic;

	INIT_WORK(&nic->tx_timeout_task, e100_tx_timeout_task);

	if ((err = e100_alloc(nic))) {
		netif_err(nic, probe, nic->netdev, "Cannot alloc driver memory, aborting\n");
		goto err_out_iounmap;
	}

	if ((err = e100_eeprom_load(nic)))
		goto err_out_free;

	/* Initialize and reset the PHY */
	e100_phy_init(nic);

	memcpy(netdev->dev_addr, nic->eeprom, ETH_ALEN);
	if (!is_valid_ether_addr(netdev->dev_addr)) {
		if (!eeprom_bad_csum_allow) {
			netif_err(nic, probe, nic->netdev, "Invalid MAC address from EEPROM, aborting\n");
			err = -EAGAIN;
			goto err_out_free;
		} else {
			netif_err(nic, probe, nic->netdev, "Invalid MAC address from EEPROM, you MUST configure one.\n");
		}
	}

	/* Wol magic packet can be enabled from eeprom */
	if ((nic->mac >= mac_82558_D101_A4) &&
	   (nic->eeprom[eeprom_id] & eeprom_id_wol)) {
		nic->flags |= wol_magic;
		device_set_wakeup_enable(&pdev->dev, true);
	}

	/* ack any pending wake events, disable PME */
	pci_pme_active(pdev, false);

	strcpy(netdev->name, "eth%d");
	/* Register the network device */
	if ((err = register_netdev(netdev))) {
		netif_err(nic, probe, nic->netdev, "Cannot register net device, aborting\n");
		goto err_out_free;
	}
	/* Create the DMA pool for command blocks */
	nic->cbs_pool = pci_pool_create(netdev->name,
			   nic->pdev,
			   nic->params.cbs.max * sizeof(struct cb),
			   sizeof(u32),
			   0);
	netif_info(nic, probe, nic->netdev,
		   "addr 0x%llx, irq %d, MAC addr %pM\n",
		   (unsigned long long)pci_resource_start(pdev, use_io ? 1 : 0),
		   pdev->irq, netdev->dev_addr);

	return 0;

err_out_free:
	e100_free(nic);
err_out_iounmap:
	pci_iounmap(pdev, nic->csr);
err_out_free_res:
	pci_release_regions(pdev);
err_out_disable_pdev:
	pci_disable_device(pdev);
err_out_free_dev:
	pci_set_drvdata(pdev, NULL);
	free_netdev(netdev);
	return err;
}

        As this code shows, probe mainly allocates, initializes, and registers the network device, and allocates and maps the I/O regions, preparing the NIC to send and receive. Note in particular that the NIC's NAPI poll function is also registered here, via netif_napi_add; once packets start arriving, softirq processing uses this poll function to pull them in. The device operations table e100_netdev_ops is defined as follows:

static const struct net_device_ops e100_netdev_ops = {
	.ndo_open		= e100_open,
	.ndo_stop		= e100_close,
	.ndo_start_xmit		= e100_xmit_frame,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_set_rx_mode	= e100_set_multicast_list,
	.ndo_set_mac_address	= e100_set_mac_address,
	.ndo_change_mtu		= e100_change_mtu,
	.ndo_do_ioctl		= e100_do_ioctl,
	.ndo_tx_timeout		= e100_tx_timeout,
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_poll_controller	= e100_netpoll,
#endif
	.ndo_set_features	= e100_set_features,
};
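
        None of these hooks is invoked at probe time; the networking core calls them later. For example, bringing the interface up ("ip link set ethX up") travels roughly dev_open() -> __dev_open() -> ops->ndo_open(), which lands in e100_open.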

        That is where we pick up next, because the interrupt-line request and interrupt-handler registration we care about live in e100_open.

e100_open

         The core of e100_open is a call to e100_up, so we focus on e100_up.

static int e100_up(struct nic *nic)
{
	int err;

	/* Allocate the receive buffer list (the RFD ring) */
	if ((err = e100_rx_alloc_list(nic)))
		return err;
	/* Allocate and DMA-map the command blocks */
	if ((err = e100_alloc_cbs(nic)))
		goto err_rx_clean_list;
	/* Initialize the hardware */
	if ((err = e100_hw_init(nic)))
		goto err_clean_cbs;
	e100_set_multicast_list(nic->netdev);
	/* Start the receive unit */
	e100_start_receiver(nic, NULL);
	mod_timer(&nic->watchdog, jiffies);
	/* Request the interrupt line and register the interrupt handler */
	if ((err = request_irq(nic->pdev->irq, e100_intr, IRQF_SHARED,
		nic->netdev->name, nic->netdev)))
		goto err_no_irq;
	/* Wake the transmit queue */
	netif_wake_queue(nic->netdev);
	/* Enable NAPI scheduling */
	napi_enable(&nic->napi);
	/* enable ints _after_ enabling poll, preventing a race between
	 * disable ints+schedule */
	/* Enable interrupts */
	e100_enable_irq(nic);
	return 0;

err_no_irq:
	del_timer_sync(&nic->watchdog);
err_clean_cbs:
	e100_clean_cbs(nic);
err_rx_clean_list:
	e100_rx_clean_list(nic);
	return err;
}

        So the e100 interrupt handler is e100_intr. Hardware interrupt handling is the crucial link between the NIC receiving a packet and the packet being handed to the stack, and it is the focus of this post. Note that the IRQ was requested with IRQF_SHARED: the line may be shared with other devices, so the handler must first confirm that its own device raised the interrupt and return IRQ_NONE otherwise, which is exactly what e100_intr does first.

Hardware interrupt handling

static irqreturn_t e100_intr(int irq, void *dev_id)
{
	struct net_device *netdev = dev_id;
	struct nic *nic = netdev_priv(netdev);
	u8 stat_ack = ioread8(&nic->csr->scb.stat_ack);

	netif_printk(nic, intr, KERN_DEBUG, nic->netdev,
		     "stat_ack = 0x%02X\n", stat_ack);

	/* Confirm the interrupt is ours */
	if (stat_ack == stat_ack_not_ours ||	/* Not our interrupt */
	   stat_ack == stat_ack_not_present)	/* Hardware is ejected */
		return IRQ_NONE;

	/* Ack interrupt(s) */
	iowrite8(stat_ack, &nic->csr->scb.stat_ack); /* acknowledge by writing the status back */

	/* We hit Receive No Resource (RNR); restart RU after cleaning */
	if (stat_ack & stat_ack_rnr) /* the receive unit stalled and must be restarted after cleanup */
		nic->ru_running = RU_SUSPENDED;

	/* If NAPI can be scheduled: disable device interrupts and schedule it */
	if (likely(napi_schedule_prep(&nic->napi))) {
		/* Disable device interrupts */
		e100_disable_irq(nic);

		/* Schedule NAPI: hang this device's poll function on the current CPU's poll list and raise the softirq */
		__napi_schedule(&nic->napi);
	}

	return IRQ_HANDLED;
}

        So e100_intr does the following:

        1) Confirms the interrupt status and acknowledges it by writing it back.

        2) Schedules NAPI: hangs the device's poll function on the current CPU's poll list and raises the softirq via __raise_softirq_irqoff(NET_RX_SOFTIRQ).
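
        Internally, __napi_schedule boils down to the following (a simplified sketch of ____napi_schedule() in net/core/dev.c; recent kernels add handling for threaded NAPI):

/* Called with hard interrupts disabled: queue this NAPI instance on the
 * per-CPU softnet_data poll list and raise the receive softirq. */
static inline void ____napi_schedule(struct softnet_data *sd,
				     struct napi_struct *napi)
{
	list_add_tail(&napi->poll_list, &sd->poll_list);
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}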

Softirq handling

        The receive softirq is NET_RX_SOFTIRQ; its handler is registered during network subsystem initialization (net_dev_init() in net/core/dev.c):

open_softirq(NET_RX_SOFTIRQ, net_rx_action);

The handler for NET_RX_SOFTIRQ is therefore net_rx_action:

static __latent_entropy void net_rx_action(struct softirq_action *h)
{
	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
	unsigned long time_limit = jiffies +
		usecs_to_jiffies(READ_ONCE(netdev_budget_usecs));
	int budget = READ_ONCE(netdev_budget);
	LIST_HEAD(list);
	LIST_HEAD(repoll);

start:
	sd->in_net_rx_action = true; /* mark this CPU as inside net_rx_action:
		code about to raise the softirq checks this flag first and can
		skip re-raising it, since the loop below drains all pending work */
	local_irq_disable();
	list_splice_init(&sd->poll_list, &list);
	local_irq_enable();

	for (;;) {
		struct napi_struct *n;

		skb_defer_free_flush(sd);

		if (list_empty(&list)) {
			if (list_empty(&repoll)) {
				sd->in_net_rx_action = false;
				barrier();
				/* We need to check if ____napi_schedule()
				 * had refilled poll_list while
				 * sd->in_net_rx_action was true.
				 */
				if (!list_empty(&sd->poll_list))
					goto start;
				if (!sd_has_rps_ipi_waiting(sd))
					goto end;
			}
			break;
		}

		/* Poll each NAPI device queued on sd->poll_list in turn */
		n = list_first_entry(&list, struct napi_struct, poll_list);
		budget -= napi_poll(n, &repoll); /* call the device's poll function to pull in a batch of skbs */

		/* If softirq window is exhausted then punt.
		 * Allow this to run for 2 jiffies since which will allow
		 * an average latency of 1.5/HZ.
		 */
		if (unlikely(budget <= 0 ||
			     time_after_eq(jiffies, time_limit))) {
			sd->time_squeeze++;
			break;
		}
	}

	local_irq_disable();

	list_splice_tail_init(&sd->poll_list, &list);
	list_splice_tail(&repoll, &list);
	list_splice(&list, &sd->poll_list);
	if (!list_empty(&sd->poll_list)) /* work remains: re-raise the softirq */
		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	else
		sd->in_net_rx_action = false;

	/* Run any pending RPS IPIs and re-enable local hard interrupts */
	net_rps_action_and_irq_enable(sd);
end:;
}

         The core of softirq processing is napi_poll, which pulls packets from the device and hands them to the stack; its inner worker __napi_poll looks like this:

static int __napi_poll(struct napi_struct *n, bool *repoll)
{
	int work, weight;

	weight = n->weight; /* upper bound on packets this device may process per poll call */

	/* This NAPI_STATE_SCHED test is for avoiding a race
	 * with netpoll's poll_napi().  Only the entity which
	 * obtains the lock and sees NAPI_STATE_SCHED set will
	 * actually make the ->poll() call.  Therefore we avoid
	 * accidentally calling ->poll() when NAPI is not scheduled.
	 */
	work = 0;
	/* If NAPI_STATE_SCHED is set, invoke the poll callback: for a NAPI
		device this is the driver-supplied function (e100_poll here);
		for legacy non-NAPI devices it is process_backlog */
	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
		work = n->poll(n, weight);
		trace_napi_poll(n, work, weight);
	}

	if (unlikely(work > weight))
		netdev_err_once(n->dev, "NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
				n->poll, work, weight);

	if (likely(work < weight))
		return work;

	/* Drivers must not modify the NAPI state if they
	 * consume the entire weight.  In such cases this code
	 * still "owns" the NAPI instance and therefore can
	 * move the instance around on the list at-will.
	 */
	if (unlikely(napi_disable_pending(n))) {
		napi_complete(n);
		return work;
	}

	/* The NAPI context has more processing work, but busy-polling
	 * is preferred. Exit early.
	 */
	if (napi_prefer_busy_poll(n)) {
		if (napi_complete_done(n, work)) {
			/* If timeout is not set, we need to make sure
			 * that the NAPI is re-scheduled.
			 */
			napi_schedule(n);
		}
		return work;
	}

	if (n->gro_bitmask) {
		/* flush too old packets
		 * If HZ < 1000, flush all packets.
		 */
		napi_gro_flush(n, HZ >= 1000);
	}

	/* 将接收报文上送协议栈 */
	gro_normal_list(n);

	/* Some drivers may have called napi_schedule
	 * prior to exhausting their budget.
	 */
	if (unlikely(!list_empty(&n->poll_list))) {
		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
			     n->dev ? n->dev->name : "backlog");
		return work;
	}

	*repoll = true;

	return work;
}
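
        In the code above, gro_normal_list is where received packets finally enter the protocol stack. A slightly abridged sketch of it (its exact form and location vary across kernel versions):

/* Flush the per-NAPI batch of received skbs into the stack. */
static void gro_normal_list(struct napi_struct *napi)
{
	if (!napi->rx_count)
		return;
	netif_receive_skb_list_internal(&napi->rx_list);
	INIT_LIST_HEAD(&napi->rx_list);
	napi->rx_count = 0;
}

        netif_receive_skb_list_internal() leads into __netif_receive_skb(), which dispatches each packet to the registered L3 handler such as ip_rcv().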

        For e100, the registered poll function is e100_poll:

static int e100_poll(struct napi_struct *napi, int budget)
{
	struct nic *nic = container_of(napi, struct nic, napi);
	unsigned int work_done = 0;

	e100_rx_clean(nic, &work_done, budget);
	e100_tx_clean(nic);

	/* If budget fully consumed, continue polling */
	if (work_done == budget)
		return budget;

	/* only re-enable interrupt if stack agrees polling is really done */
	if (likely(napi_complete_done(napi, work_done)))
		e100_enable_irq(nic);

	return work_done;
}

        The receive-side work happens in e100_rx_clean: the NIC's DMA engine has already written packets directly into the driver's pre-allocated rx->list buffers (the ring buffer mentioned earlier), and e100_rx_clean walks that list, processing each completed buffer.
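
        Within e100_rx_clean, each completed receive descriptor is handed to e100_rx_indicate, whose core (abridged from e100.c; status checks and statistics omitted) unmaps the buffer the NIC wrote into and delivers the skb:

/* Abridged from e100_rx_indicate(): rx is the ring entry, skb its buffer. */
pci_unmap_single(nic->pdev, rx->dma_addr, RFD_BUF_LEN, PCI_DMA_FROMDEVICE);
skb_reserve(skb, sizeof(struct rfd)); /* skip the receive frame descriptor */
skb_put(skb, actual_size);            /* length reported by the hardware in the RFD */
skb->protocol = eth_type_trans(skb, nic->netdev);
netif_receive_skb(skb);               /* hand the packet to the protocol stack */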

        The call chain that carries packets up into the protocol stack is:

[Figure 2: call chain for delivering packets to the protocol stack]
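
        For reference, that path runs roughly netif_receive_skb() -> __netif_receive_skb() -> __netif_receive_skb_core() -> deliver_skb() -> the matching packet_type handler (ip_rcv() for IPv4), from which processing continues up through L3/L4.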

        This completes the walk from NIC reception to the protocol stack. Many spots could use more detail, which I hope to fill in later; I welcome discussion and corrections.
