linux | 网络数据包softirq 软中断与CPU

2023-03-18 17:46:33 浏览数 (1)

Linux 在每个 CPU 上会创建一个 ksoftirqd 内核线程。

softirqs 是在 Linux 内核编译时就确定好的,例如网络收包对应的 NET_RX_SOFTIRQ 软中断。因此是一种静态机制。如果想加一种新 softirq 类型,就需要修改并重新编译内核。

代码语言:javascript复制
heidsoft@heidsoft-dev:~$ systemd-cgls -k | grep kworker
├─     8 [kworker/0:0H-events_highpri]
├─    24 [kworker/1:0H-events_highpri]
├─    91 [kworker/0:1H-kblockd]
├─   126 [kworker/u65:0]
├─   155 [kworker/1:1H-kblockd]
├─164504 [kworker/1:3-events]
├─166487 [kworker/u64:2-events_unbound]
├─166610 [kworker/0:1-events]
├─167421 [kworker/u64:0-events_unbound]
├─167437 [kworker/0:0-events]
├─167685 [kworker/1:1-events]
├─168449 [kworker/u64:1]
├─168597 [kworker/0:2-kec_query]
代码语言:javascript复制
heidsoft@heidsoft-dev:~$ cat /proc/interrupts
           CPU0       CPU1       
  0:          7          0   IO-APIC   2-edge      timer
  1:         37          0   IO-APIC   1-edge      i8042
  7:          0          0   IO-APIC   7-edge      parport0
  8:          0          1   IO-APIC   8-edge      rtc0
  9:          0     176583   IO-APIC   9-fasteoi   acpi
 12:          0        144   IO-APIC  12-edge      i8042
 14:          0          0   IO-APIC  14-edge      ata_piix
 15:          0          0   IO-APIC  15-edge      ata_piix
 18:      38262      10860   IO-APIC  18-fasteoi   uhci_hcd:usb2, ioc0
 19:        136         61   IO-APIC  19-fasteoi   ehci_hcd:usb1
 22:          0          0   IO-APIC  22-fasteoi   virtio1
 24:       4520          0   PCI-MSI 49152-edge      prl_tg
 25:        201        109   PCI-MSI 81920-edge      virtio0-config
 26:      20479     411330   PCI-MSI 81921-edge      virtio0-input.0
 27:     471647      51205   PCI-MSI 81922-edge      virtio0-output.0
 28:         32          0   PCI-MSI 487424-edge      xhci_hcd
 29:      17040      15240   PCI-MSI 512000-edge      ahci[0000:00:1f.2]
 30:          0          1   PCI-MSI 524288-edge      virtio2-config
 31:       1131       1433   PCI-MSI 524289-edge      virtio2-virtqueues
 32:          0       4734   PCI-MSI 516096-edge      snd_hda_intel:card0
NMI:          0          0   Non-maskable interrupts
LOC:    2110890    1998553   Local timer interrupts
SPU:          0          0   Spurious interrupts
PMI:          0          0   Performance monitoring interrupts
IWI:          0          0   IRQ work interrupts
RTR:          0          0   APIC ICR read retries
RES:      94259      90183   Rescheduling interrupts
CAL:    1563802    1451446   Function call interrupts
TLB:       1119       1363   TLB shootdowns
TRM:          0          0   Thermal event interrupts
THR:          0          0   Threshold APIC interrupts
DFR:          0          0   Deferred Error APIC interrupts
MCE:          0          0   Machine check exceptions
MCP:        210        210   Machine check polls
ERR:          0
MIS:          0
PIN:          0          0   Posted-interrupt notification event
NPI:          0          0   Nested posted-interrupt event
PIW:          0          0   Posted-interrupt wakeup event
heidsoft@heidsoft-dev:~$

/research/linux-5.15.4/net/core/dev.c

注册网卡收发包(RX/TX)软中断处理函数

open_softirq(NET_TX_SOFTIRQ, net_tx_action);

open_softirq(NET_RX_SOFTIRQ, net_rx_action);

代码语言:javascript复制
/*
 * net_dev_init - boot-time initialisation of the core network device layer.
 *
 * Sets up the packet-type lists, the per-CPU softnet_data receive state,
 * the loopback/default per-namespace devices, and registers the network
 * TX/RX softirq handlers.
 *
 * Returns 0 on success. rc starts out as -ENOMEM, so every failing step
 * that jumps to "out" makes the function return -ENOMEM.
 */
static int __init net_dev_init(void)
{
  int i, rc = -ENOMEM;

  /* Must run while dev_boot_phase is still set; it is cleared below. */
  BUG_ON(!dev_boot_phase);

  if (dev_proc_init())
    goto out;

  if (netdev_kobject_init())
    goto out;

  /* Empty list heads for the packet-type handler tables. */
  INIT_LIST_HEAD(&ptype_all);
  for (i = 0; i < PTYPE_HASH_SIZE; i++)
    INIT_LIST_HEAD(&ptype_base[i]);

  INIT_LIST_HEAD(&offload_base);

  if (register_pernet_subsys(&netdev_net_ops))
    goto out;

  /*
   *  Initialise the packet receive queues.
   */

  for_each_possible_cpu(i) {
    /* Per-CPU flush work item and per-CPU softnet_data for CPU i. */
    struct work_struct *flush = per_cpu_ptr(&flush_works, i);
    struct softnet_data *sd = &per_cpu(softnet_data, i);

    INIT_WORK(flush, flush_backlog);

    skb_queue_head_init(&sd->input_pkt_queue);
    skb_queue_head_init(&sd->process_queue);
#ifdef CONFIG_XFRM_OFFLOAD
    skb_queue_head_init(&sd->xfrm_backlog);
#endif
    INIT_LIST_HEAD(&sd->poll_list);
    /* Tail pointer starts at the (empty) head of the output queue. */
    sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
    INIT_CSD(&sd->csd, rps_trigger_softirq, sd);
    sd->cpu = i;
#endif

    /* The per-CPU backlog is polled by process_backlog with weight_p. */
    init_gro_hash(&sd->backlog);
    sd->backlog.poll = process_backlog;
    sd->backlog.weight = weight_p;
  }

  dev_boot_phase = 0;

  /* The loopback device is special if any other network devices
   * is present in a network namespace the loopback device must
   * be present. Since we now dynamically allocate and free the
   * loopback device ensure this invariant is maintained by
   * keeping the loopback device as the first device on the
   * list of network devices.  Ensuring the loopback devices
   * is the first device that appears and the last network device
   * that disappears.
   */
  if (register_pernet_device(&loopback_net_ops))
    goto out;

  if (register_pernet_device(&default_device_ops))
    goto out;

  /* Register the handlers run for the network TX and RX softirqs. */
  open_softirq(NET_TX_SOFTIRQ, net_tx_action);
  open_softirq(NET_RX_SOFTIRQ, net_rx_action);

  /*
   * A failure to register the CPU-hotplug callback is only warned about:
   * rc is reset to 0 afterwards, so the function still reports success.
   */
  rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
               NULL, dev_cpu_dead);
  WARN_ON(rc < 0);
  rc = 0;
out:
  return rc;
}
代码语言:javascript复制
heidsoft@heidsoft-dev:~$ sudo perf record -a \
>         -e irq:irq_handler_entry,irq:irq_handler_exit \
>         -e irq:softirq_entry --filter="vec == 3" \
>         -e irq:softirq_exit --filter="vec == 3" \
>         -e napi:napi_poll \
>         -- sleep 1
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.725 MB perf.data (20 samples) ]
heidsoft@heidsoft-dev:~$ sudo perf script
         swapper     0 [000] 67144.378134: irq:irq_handler_entry: irq=18 name=uhci_hcd:usb2
         swapper     0 [000] 67144.378145:  irq:irq_handler_exit: irq=18 ret=unhandled
         swapper     0 [000] 67144.378146: irq:irq_handler_entry: irq=18 name=ioc0
         swapper     0 [000] 67144.378167:  irq:irq_handler_exit: irq=18 ret=handled
         swapper     0 [000] 67144.378685: irq:irq_handler_entry: irq=18 name=uhci_hcd:usb2
         swapper     0 [000] 67144.378692:  irq:irq_handler_exit: irq=18 ret=unhandled
         swapper     0 [000] 67144.378692: irq:irq_handler_entry: irq=18 name=ioc0
         swapper     0 [000] 67144.378710:  irq:irq_handler_exit: irq=18 ret=handled
         swapper     0 [001] 67144.512867: irq:irq_handler_entry: irq=26 name=virtio0-input.0
         swapper     0 [001] 67144.512870:  irq:irq_handler_exit: irq=26 ret=handled
         swapper     0 [001] 67144.512873:     irq:softirq_entry: vec=3 [action=NET_RX]
         swapper     0 [001] 67144.514142: irq:irq_handler_entry: irq=27 name=virtio0-output.0
         swapper     0 [001] 67144.514144:  irq:irq_handler_exit: irq=27 ret=handled
         swapper     0 [001] 67144.514146:        napi:napi_poll: napi poll on napi struct 0xffff8dfa>
         swapper     0 [001] 67144.514147:      irq:softirq_exit: vec=3 [action=NET_RX]
         swapper     0 [001] 67144.514148:     irq:softirq_entry: vec=3 [action=NET_RX]
         swapper     0 [001] 67144.514150:        napi:napi_poll: napi poll on napi struct 0xffff8dfa>
         swapper     0 [001] 67144.514151:      irq:softirq_exit: vec=3 [action=NET_RX]
         swapper     0 [000] 67144.519984: irq:irq_handler_entry: irq=29 name=ahci[0000:00:1f.2]
         swapper     0 [000] 67144.520006:  irq:irq_handler_exit: irq=29 ret=handled

NAPI, or New API, was written to make processing packets of incoming cards more efficient. Hard interrupts are expensive because they cannot be interrupted. Even with interrupt coalescence (described later in more detail), the interrupt handler will monopolize a CPU core completely. The design of NAPI allows the driver to go into a polling mode instead of being hard-interrupted for every required packet receive. Under normal operation, an initial hard interrupt or IRQ is raised, followed by a SoftIRQ handler which polls the card using NAPI routines. The polling routine has a budget which determines the CPU time the code is allowed. This is required to prevent SoftIRQs from monopolizing the CPU. On completion, the kernel will exit the polling routine and re-arm, then the entire procedure will repeat itself.

NAPI(New API)的设计目的是更高效地处理网卡收到的数据包。硬中断的开销很大,因为它们不能被打断。即使启用了中断合并(稍后详述),中断处理程序仍会完全独占一个 CPU 核心。NAPI 的设计允许驱动程序进入轮询模式,而不必每收到一个数据包就触发一次硬中断。正常情况下,首先触发一次硬中断(IRQ),随后由 SoftIRQ 处理程序通过 NAPI 例程轮询网卡。轮询例程有一个预算(budget),它决定了这段代码可以占用的 CPU 时间,这是防止 SoftIRQ 独占 CPU 所必需的。轮询完成后,内核退出轮询例程并重新启用中断(re-arm),之后整个过程循环往复。

If the SoftIRQs do not run for long enough, the rate of incoming data could exceed the kernel's capability to drain the buffer fast enough. As a result, the NIC buffers will overflow and traffic will be lost. Occasionally, it is necessary to increase the time that SoftIRQs are allowed to run on the CPU. This is known as the netdev_budget. The default value of the budget is 300. This will cause the SoftIRQ process to drain 300 messages from the NIC before getting off the CPU:

如果 SoftIRQ 的运行时间不够长,数据到达的速率就可能超过内核排空缓冲区的速度。结果是网卡(NIC)缓冲区溢出,流量丢失。因此,有时需要增加 SoftIRQ 被允许在 CPU 上运行的时间,这个参数称为 netdev_budget。该预算的默认值为 300,这会使 SoftIRQ 处理过程在让出 CPU 之前从网卡取走 300 个报文:

代码语言:javascript复制
heidsoft@heidsoft-dev:~$ sysctl net.core.netdev_budget 
net.core.netdev_budget = 300
heidsoft@heidsoft-dev:~$
  • http://arthurchiao.art/blog/linux-irq-softirq-zh/#1-什么是中断
  • https://novoland.github.io/网络/2014/07/26/网卡中断负载均衡.html
  • https://applezulab.netdpi.net/linux-prog/about-linux-smp_affinity
  • https://web.archive.org/web/20200225050436/http://blog.yufeng.info/archives/2422
  • https://github.com/coreos/bugs/issues/2135
  • https://kernelbook.sourceforge.net/kernel-hacking.html/ioctls.html
  • https://unix.stackexchange.com/questions/341947/what-is-the-difference-between-proc-interrupts-and-proc-softirq-in-linux
  • https://people.kernel.org/dsahern/the-cpu-cost-of-networking-on-a-host

0 人点赞