tracepoint类型的ebpf程序是如何被执行的

2024-01-04 21:37:44 浏览数 (2)

本文基于libbpf实现的ebpf例子介绍tracepoint类型ebpf程序调用流程,内核实现以5.4版本为例进行介绍。

一. 基于libbpf实现一个跟踪kfree_skb的tracepoint类型ebpf示例:

libbpf/bpftool项目地址:https://github.com/libbpf/libbpf,libbpf提供了一些加载bpf程序的方法,封装了内核提供的bpf()系统调用,帮助我们省去了大部分关于bpf本身的逻辑。

1. 环境准备

内核支持CONFIG_DEBUG_INFO_BTF且安装相关的工具包:

代码语言:javascript复制
zgrep "CONFIG_DEBUG_INFO_BTF=y" /proc/config.gz //检查内核支持CONFIG_DEBUG_INFO_BTF
yum install bpftool libbpf-devel llvm-libs  llvm-devel llvm -y

下载编译libbpf:

代码语言:javascript复制

git clone https://github.com/libbpf/libbpf.git
cd libbpf/src
make

编译后生成libbpf.so libbpf.so.1 libbpf.so.1.3.0 libbpf.a库文件:
# pwd
/root/ebpf-example/libbpf/src
# ls
bpf.c               bpf_helper_defs.h  btf_dump.c    hashmap.h        libbpf.h           libbpf.pc.template  libbpf_version.h  nlattr.h     skel_internal.h  strset.h
bpf_core_read.h     bpf_helpers.h      btf.h         libbpf.a         libbpf_internal.h  libbpf_probes.c     linker.c          relo_core.c  staticobjs       usdt.bpf.h
bpf_endian.h        bpf_prog_linfo.c   elf.c         libbpf.c         libbpf_legacy.h    libbpf.so           Makefile          relo_core.h  str_error.c      usdt.c
bpf_gen_internal.h  bpf_tracing.h      gen_loader.c  libbpf_common.h  libbpf.map         libbpf.so.1         netlink.c         ringbuf.c    str_error.h      zip.c
bpf.h               btf.c              hashmap.c     libbpf_errno.c   libbpf.pc          libbpf.so.1.3.0     nlattr.c          sharedobjs   strset.c         zip.h

后面程序使用的是<bpf/libbpf.h>,需要在/root/ebpf-example/libbpf/src创建bpf子目录,并将头文件拷贝
到bpf目录下,不然会直接使用系统自带的libbpf头文件。当然也可以存放到其他路径下,编译时通过-I和-L参数分别指定头文件和库文件的路径。
#mkdir bpf
# cp *.h bpf
# ls
bpf_core_read.h  bpf_gen_internal.h  bpf_helper_defs.h  bpf_tracing.h  hashmap.h        libbpf.h           libbpf_legacy.h   nlattr.h     skel_internal.h  strset.h    zip.h
bpf_endian.h     bpf.h               bpf_helpers.h      btf.h          libbpf_common.h  libbpf_internal.h  libbpf_version.h  relo_core.h  str_error.h      usdt.bpf.h


编译libbpf时也可以通过环境变量指定编译后库文件和头文件的安装路径:
#cd libbpf
#make -C src/ clean
#BUILD_STATIC_ONLY=y OBJDIR=./ INCLUDEDIR=./ make -C src/ install

编译安装后libbpf.a在libbpf/src目录下:
#cd libbpf
# find | grep libbpf.a
./src/libbpf.a
头文件在libbpf/src和libbpf/src/bpf下
# find | grep -w bpf.h
./include/uapi/linux/bpf.h
./src/bpf/usdt.bpf.h
./src/bpf/bpf.h
./src/usdt.bpf.h
./src/bpf.h

2. 构建并编译ebpf程序在内核态执行的代码:

代码语言:javascript复制
//bpftool读取 vmlinux 文件并生成对应的 vmlinux.h 头文件。
bpftool btf dump file /sys/kernel/btf/vmlinux format c > vmlinux.h

/*以kfree_skb为例,内核使用TRACE_EVENT定义了名为kfree_skb的tracepoint,
在eBPF程序里如果需要获取tracepoint的参数,需要按照同样的结构体格式
来访问entry里的数据。可以通过tracepoint的format来查看entry的数据结构.*/
# cat /sys/kernel/debug/tracing/events/skb/kfree_skb/format
name: kfree_skb
ID: 1409
format:
        field:unsigned short common_type;       offset:0;       size:2; signed:0;
        field:unsigned char common_flags;       offset:2;       size:1; signed:0;
        field:unsigned char common_preempt_count;       offset:3;       size:1; signed:0;
        field:int common_pid;   offset:4;       size:4; signed:1;

        field:void * skbaddr;   offset:8;       size:8; signed:0;
        field:void * location;  offset:16;      size:8; signed:0;
        field:unsigned short protocol;  offset:24;      size:2; signed:0;

print fmt: "skbaddr=%p protocol=%u location=%p", REC->skbaddr, REC->protocol, REC->location




/*tracepoint的format输出信息描述的是entry结构体,tracepoint的前8个字节都是
统一记录struct trace_entry的信息,struct trace_entry占用8个字节,
因此8个字节后的才是每个tracepoint自定义的参数
struct trace_entry {
    unsigned short type;
    unsigned char flags;
    unsigned char preempt_count;
    int pid;
}
*/

/*
TRACE_EVENT(kfree_skb,......) 中TP_STRUCT__entry定义的才是kfree_skb的tracepoint能获取的参数。
注意:下面的reason字段是较新内核(5.17起)才增加的,本文使用的5.4内核中没有该字段,
5.4内核的字段与上面format输出一致(skbaddr/location/protocol)。
TP_STRUCT__entry(
                __field(void *,         skbaddr)
                __field(void *,         location)
                __field(unsigned short, protocol)
                __field(enum skb_drop_reason,   reason)
        ),
*/
//ebpf程序内核态要执行的代码:
# cat trace_kfree_skb.bpf.c

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
/* Context layout handed to the kfree_skb tracepoint program; it follows the
 * field list printed by tracing/events/skb/kfree_skb/format. */
struct args_kfree_skb {
        void *regs;              /* 8-byte placeholder for the common fields at offsets 0-7 (struct trace_entry) */
        void *skbaddr;           /* format: offset 8, size 8 */
        void *location;          /* format: offset 16, size 8 */
        unsigned short protocol; /* format: offset 24, size 2 */
};
/*
 * Handler attached to the skb:kfree_skb tracepoint. Prints the freed skb's
 * address and the recorded location through bpf_printk.
 *
 * Returns 1 rather than 0: per the author's note, returning 0 prevents
 * `perf trace -e skb:kfree_skb` from receiving the event (the article explains
 * this when it covers perf_trace_run_bpf_submit()).
 */
SEC("tracepoint/skb/kfree_skb")
int tp_kfree_skb(struct args_kfree_skb *args)
{
        /* "\n" restored: the listing had lost the backslash ("%llxn"). */
        bpf_printk("hello test bpf :skbaddr:%llx,location:%llx\n", args->skbaddr, args->location);
        return 1;
}
char LICENSE[] SEC("license") = "GPL";

//编译trace_kfree_skb.bpf.c, /root/ebpf-example/libbpf/src为前面libbpf代码所在路径:
clang -g -O3 -target bpf -D__TARGET_ARCH_x86 -I /root/ebpf-example/libbpf/src -c trace_kfree_skb.bpf.c -o trace_kfree_skb.bpf.o
或者
clang -g -O3 -target bpf -D__TARGET_ARCH_x86_64 -I /root/ebpf-example/libbpf/src -c trace_kfree_skb.bpf.c -o trace_kfree_skb.bpf.o

3. 构建并编译ebpf程序在用户态执行的代码:

代码语言:javascript复制
/*基于前面ebpf程序编译出来的trace_kfree_skb.bpf.o构建skeleton头文件,ebpf例子程序的
用户态程序会调用头文件里定义的函数,通过libbpf提供的接口将ebpf例子程序的内核部分代码加载
到内核并关联到我们要trace的tracepoint上。
生成的 BPF skeleton 有相应的函数来实现每个阶段的触发:
<name>__open() – 创建并打开 BPF 应用程序;
<name>__load() – 实例化、加载和验证 BPF 应用程序部分;
<name>__attach() – 附加所有可自动附加的 BPF 程序(它是可选的,你可以通过直接使用 libbpf API 获得更多控制);
<name>__destroy() – 分离所有BPF 程序并释放所有使用的资源。
*/
bpftool gen skeleton trace_kfree_skb.bpf.o > trace_kfree_skb.skel.h



//ebpf程序用户态要执行的代码:
#include <stdio.h>
#include <unistd.h>
#include <sys/resource.h>
#include <bpf/libbpf.h>
#include "trace_kfree_skb.skel.h"

/* libbpf log callback: writes every message to stderr, whatever its level. */
static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
{
        int written;

        (void)level;
        written = vfprintf(stderr, format, args);
        return written;
}

/*
 * User-space loader for the kfree_skb example: lifts RLIMIT_MEMLOCK, then
 * opens, loads and attaches the BPF skeleton and sleeps forever so the
 * program stays attached.
 *
 * Returns 1 if setrlimit, open, load or attach fails. On success it never
 * returns; stop it with a signal.
 */
int main(int argc, char **argv)
{
        struct trace_kfree_skb_bpf *skel;
        struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
        int err;

        /* Remove the locked-memory limit before any BPF object is loaded. */
        if (setrlimit(RLIMIT_MEMLOCK, &r)) {
                perror("setrlimit(RLIMIT_MEMLOCK)");
                return 1;
        }

        libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
        /* Set up libbpf errors and debug info callback */
        libbpf_set_print(libbpf_print_fn);

        /* Open BPF application */
        skel = trace_kfree_skb_bpf__open();
        if (!skel) {
                fprintf(stderr, "Failed to open BPF skeleton\n");
                return 1;
        }

        /* Load & verify BPF programs */
        err = trace_kfree_skb_bpf__load(skel);
        if (err) {
                fprintf(stderr, "Failed to load and verify BPF skeleton\n");
                goto cleanup;
        }

        /* Attach tracepoint handler */
        err = trace_kfree_skb_bpf__attach(skel);
        if (err) {
                fprintf(stderr, "Failed to attach BPF skeleton\n");
                goto cleanup;
        }

        /* The space before "to see" keeps the two concatenated literals apart. */
        printf("Successfully started! Please run `sudo cat /sys/kernel/debug/tracing/trace_pipe` or run 'bpftool prog tracelog' "
               "to see output of the BPF programs.\n");

        for (;;) {
                sleep(60);
        }

cleanup:
        trace_kfree_skb_bpf__destroy(skel);
        /* Only reached through a failed load/attach: report it in the exit status. */
        return err ? 1 : 0;
}

/*编译trace_kfree_skb.c, /root/ebpf-example/libbpf/src为前面
libbpf代码以及libbpf的库文件所在路径,使用静态库libbpf.a:*/
clang -g -O2 trace_kfree_skb.c -I /root/ebpf-example/libbpf/src -L /root/ebpf-example/libbpf/src -l:libbpf.a -lelf -lz -o trace_kfree_skb

4. 执行构建的ebpf程序:

代码语言:javascript复制

//执行ebpf程序
./trace_kfree_skb

/*可通过bpftool prog tracelog或者cat /sys/kernel/debug/tracing/trace_pipe查看
bpf_printk输出信息,使用cat /sys/kernel/debug/tracing/trace_pipe查看时需要
确保/sys/kernel/debug/tracing/tracing_on是打开的。*/
# bpftool prog tracelog
              sh-2740707 [001] .... 1566460.533201: 0: hello test bpf :skbaddr:ffff88806044f000
              sh-2740707 [001] .... 1566460.533208: 0: hello test bpf :skbaddr:ffff88806044f000
              ......
              ......
              
              

//bpftool查看加载的ebpf程序信息        
# bpftool prog show 
450: tracepoint  name tp_kfree_skb  tag 9e46ea8015d2c45f  gpl
        loaded_at 2023-09-15T21:27:51 0800  uid 0
        xlated 200B  jited 143B  memlock 4096B
        btf_id 322
                            
# bpftool prog show id 450
450: tracepoint  name tp_kfree_skb  tag 9e46ea8015d2c45f  gpl
        loaded_at 2023-09-15T21:27:51 0800  uid 0
        xlated 200B  jited 143B  memlock 4096B
        btf_id 322



# bpftool btf dump id 322
450: tracepoint  name tp_kfree_skb  tag 9e46ea8015d2c45f  gpl
        loaded_at 2023-09-15T21:27:51 0800  uid 0
        xlated 200B  jited 143B  memlock 4096B
        btf_id 322
[root@VM-2-48-tencentos ebpf-example]# bpftool btf dump id 322
[1] PTR '(anon)' type_id=2
[2] STRUCT 'args_kfree_skb' size=32 vlen=4
        'regs' type_id=3 bits_offset=0
        'skbaddr' type_id=3 bits_offset=64
        'location' type_id=3 bits_offset=128
        'protocol' type_id=4 bits_offset=192
[3] PTR '(anon)' type_id=0
[4] INT 'unsigned short' size=2 bits_offset=0 nr_bits=16 encoding=(none)
[5] FUNC_PROTO '(anon)' ret_type_id=6 vlen=1
        'args' type_id=1
[6] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED
[7] FUNC 'tp_kfree_skb' type_id=5 linkage=global
[8] INT 'char' size=1 bits_offset=0 nr_bits=8 encoding=SIGNED
[9] ARRAY '(anon)' type_id=8 index_type_id=10 nr_elems=4
[10] INT '__ARRAY_SIZE_TYPE__' size=4 bits_offset=0 nr_bits=32 encoding=(none)
[11] VAR 'LICENSE' type_id=9, linkage=global-alloc
[12] DATASEC 'license' size=4 vlen=1
        type_id=11 offset=0 size=4
 

            
# bpftool btf dump id 307 format c
#ifndef __VMLINUX_H__
#define __VMLINUX_H__

#ifndef BPF_NO_PRESERVE_ACCESS_INDEX
#pragma clang attribute push (__attribute__((preserve_access_index)), apply_to = record)
#endif

struct args_kfree_skb {
        void *regs;
        void *skbaddr;
        void *location;
        unsigned short protocol;
};

#ifndef BPF_NO_PRESERVE_ACCESS_INDEX
#pragma clang attribute pop
#endif

#endif /* __VMLINUX_H__ */            
                                                

二. ebpf程序被调用执行的流程

从前面例子的用户态代码trace_kfree_skb.c可以知道,使用libbpf开发ebpf程序时,用户态程序主要通过调用“bpftool gen skeleton trace_kfree_skb.bpf.o > trace_kfree_skb.skel.h”生成的trace_kfree_skb.skel.h所提供的函数,来完成三件事:解析ebpf程序obj文件的elf section、加载object文件到内核、把程序挂到事件的钩子上。

下面基于前面的例子介绍下这三个步骤主要完成的工作:

1. trace_kfree_skb_bpf__open加载object文件trace_kfree_skb.bpf.o并处理elf section信息:

代码语言:javascript复制
//trace_kfree_skb.skel.h
/*
 * Allocates the skeleton wrapper, fills in its description through
 * trace_kfree_skb_bpf__create_skeleton() and opens the embedded BPF object.
 *
 * Returns the new object, or NULL on any failure (whatever was built so far
 * is destroyed first).
 */
static inline struct trace_kfree_skb_bpf *
trace_kfree_skb_bpf__open_opts(const struct bpf_object_open_opts *opts)
{
        struct trace_kfree_skb_bpf *obj;

        obj = (struct trace_kfree_skb_bpf *)calloc(1, sizeof(*obj));
        if (!obj)
                return NULL;
        if (trace_kfree_skb_bpf__create_skeleton(obj))
                goto err;
        if (bpf_object__open_skeleton(obj->skeleton, opts))
                goto err;

        return obj;
err:
        trace_kfree_skb_bpf__destroy(obj);
        return NULL;
}
//s->name是skeleton的名字;s->progs[0].name指定的才是ebpf程序SEC()指定的监听事件被触发时我们的程序要执行的函数名(tp_kfree_skb)。
//s->data_sz 和s->data分别对应的是ebpf程序的object文件trace_kfree_skb.bpf.o大小和文件数据。
/*
 * Builds the bpf_object_skeleton that describes this object to libbpf: the
 * skeleton name, where the struct bpf_object pointer is stored, one program
 * slot (name plus the locations of its prog/link pointers) and the embedded
 * ELF image of trace_kfree_skb.bpf.o.
 *
 * Returns 0 on success, -1 if an allocation fails.
 */
static inline int
trace_kfree_skb_bpf__create_skeleton(struct trace_kfree_skb_bpf *obj)
{
        struct bpf_object_skeleton *s;

        s = (struct bpf_object_skeleton *)calloc(1, sizeof(*s));
        if (!s)
                return -1;
        obj->skeleton = s;

        s->sz = sizeof(*s);
        s->name = "trace_kfree_skb_bpf";
        s->obj = &obj->obj;

        /* programs */
        s->prog_cnt = 1;
        s->prog_skel_sz = sizeof(*s->progs);
        s->progs = (struct bpf_prog_skeleton *)calloc(s->prog_cnt, s->prog_skel_sz);
        if (!s->progs)
                goto err;

        s->progs[0].name = "tp_kfree_skb";
        s->progs[0].prog = &obj->progs.trace_kfree_skb;
        s->progs[0].link = &obj->links.trace_kfree_skb;

        /*
         * Embedded ELF image (4808 bytes in the generated header). Only the
         * ELF magic is reproduced here; the rest of the byte string is elided
         * in this listing.
         */
        s->data_sz = 4808;
        s->data = (void *)"\x7f\x45\x4c\x46" /* ...... remaining bytes elided ...... */;
        return 0;
err:
        bpf_object__destroy_skeleton(s);
        return -1;
}

//bpf_object__open_skeleton是libbpf提供的库函数:
int bpf_object__open_skeleton(struct bpf_object_skeleton *s,
                              const struct bpf_object_open_opts *opts)
  -->bpf_object__open_mem(const void *obj_buf, size_t obj_buf_sz,
                     const struct bpf_object_open_opts *opts)//obj_buf和obj_buf_sz传递的参数是object文件数据地址和大小
      --> bpf_object_open(NULL, obj_buf, obj_buf_sz, opts)//读取obj文件,解析elf中section信息。
            -->obj = bpf_object__new(path, obj_buf, obj_buf_sz, obj_name);//创建并初始化obj结构体用于存放object文件信息
            -->err = bpf_object__elf_init(obj);//读取elf文件并检查elf文件是否完整
            -->err = err ? : bpf_object__check_endianness(obj);//判断ebpf程序可执行文件大小端
            -->err = err ? : bpf_object__elf_collect(obj);//读取elf节信息(license/version/maps/.maps/.BTF/.BTF.ext/.text/.data等)
            -->err = err ? : bpf_object__collect_externs(obj);//收集extern符号(kconfig/ksym等外部变量和函数)信息
            -->err = err ? : bpf_object_fixup_btf(obj);//修正btf中DATASEC等与data section相关的信息
            -->err = err ? : bpf_object__init_maps(obj, opts);//读取map信息
            -->err = err ? : bpf_object_init_progs(obj, opts);//根据ebpf程序中sec()宏传递的参数匹配对应的SEC_DEF
            -->err = err ? : bpf_object__collect_relos(obj);//读取重定位信息

ebpf程序内核态执行的程序编译后会生成一个elf格式的可执行object文件,该文件中包含了编译器指令生成的段,这些段是通过bpf程序调用的SEC()宏生成的,其参数为section的名字,段名定义了 libbpf 程序创建的是什么类型(示例是tracepoint)的 BPF 程序,以及它是附着到内核上哪个地方:

代码语言:javascript复制
# llvm-objdump --section-headers trace_kfree_skb.bpf.o 

trace_kfree_skb.bpf.o:  file format elf64-bpf

Sections:
Idx Name                     Size     VMA              Type
  0                          00000000 0000000000000000 
  1 .strtab                  000000f9 0000000000000000 
  2 .text                    00000000 0000000000000000 TEXT
  3 tracepoint/skb/kfree_skb 000000c8 0000000000000000 TEXT
  4 .rodata.str1.1           0000002c 0000000000000000 DATA
  5 license                  00000004 0000000000000000 DATA
  6 .debug_loc               00000023 0000000000000000 DEBUG
  7 .debug_abbrev            000000e8 0000000000000000 DEBUG
  8 .debug_info              00000134 0000000000000000 DEBUG
  9 .rel.debug_info          000001b0 0000000000000000 
 10 .debug_ranges            00000030 0000000000000000 DEBUG
 11 .debug_str               00000112 0000000000000000 DEBUG
 12 .BTF                     000002e3 0000000000000000 
 13 .rel.BTF                 00000010 0000000000000000 
 14 .BTF.ext                 00000090 0000000000000000 
 15 .rel.BTF.ext             00000060 0000000000000000 
 16 .debug_frame             00000028 0000000000000000 DEBUG
 17 .rel.debug_frame         00000020 0000000000000000 
 18 .debug_line              00000091 0000000000000000 DEBUG
 19 .rel.debug_line          00000010 0000000000000000 
 20 .llvm_addrsig            00000002 0000000000000000 
 21 .symtab                  00000108 00000000000000

上面的ebpf示例程序编译后生成了名为trace_kfree_skb.bpf.o的elf格式object文件,libbpf库提供的装载函数会使用这些段的信息。下面是libbpf默认定义的配置:libbpf会把SEC()宏的参数与数组section_defs中定义的段名前缀进行字符串比较,找到匹配的ebpf程序类型;在执行bpf_object_init_progs函数时,libbpf根据段的信息决定后面attach ebpf程序这一步骤要执行的动作。

代码语言:javascript复制
//libbpf.c

/*
 * Built-in table mapping SEC() name prefixes to a program type, an expected
 * attach type, flags and, where auto-attach is supported, an attach callback.
 *
 * A trailing '+' in the prefix (lost in the original listing, where it showed
 * up as a space) means the section may be either exactly SEC("type") or
 * SEC("type/extras").
 */
static const struct bpf_sec_def section_defs[] = {
        SEC_DEF("socket",               SOCKET_FILTER, 0, SEC_NONE),
        SEC_DEF("sk_reuseport/migrate", SK_REUSEPORT, BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, SEC_ATTACHABLE),
        SEC_DEF("sk_reuseport",         SK_REUSEPORT, BPF_SK_REUSEPORT_SELECT, SEC_ATTACHABLE),
        SEC_DEF("kprobe+",              KPROBE, 0, SEC_NONE, attach_kprobe),
        SEC_DEF("uprobe+",              KPROBE, 0, SEC_NONE, attach_uprobe),
        SEC_DEF("uprobe.s+",            KPROBE, 0, SEC_SLEEPABLE, attach_uprobe),
        SEC_DEF("kretprobe+",           KPROBE, 0, SEC_NONE, attach_kprobe),
        SEC_DEF("uretprobe+",           KPROBE, 0, SEC_NONE, attach_uprobe),
        SEC_DEF("uretprobe.s+",         KPROBE, 0, SEC_SLEEPABLE, attach_uprobe),
        SEC_DEF("kprobe.multi+",        KPROBE, BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi),
        SEC_DEF("kretprobe.multi+",     KPROBE, BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi),
        SEC_DEF("uprobe.multi+",        KPROBE, BPF_TRACE_UPROBE_MULTI, SEC_NONE, attach_uprobe_multi),
        SEC_DEF("uretprobe.multi+",     KPROBE, BPF_TRACE_UPROBE_MULTI, SEC_NONE, attach_uprobe_multi),
        SEC_DEF("uprobe.multi.s+",      KPROBE, BPF_TRACE_UPROBE_MULTI, SEC_SLEEPABLE, attach_uprobe_multi),
        SEC_DEF("uretprobe.multi.s+",   KPROBE, BPF_TRACE_UPROBE_MULTI, SEC_SLEEPABLE, attach_uprobe_multi),
        SEC_DEF("ksyscall+",            KPROBE, 0, SEC_NONE, attach_ksyscall),
        SEC_DEF("kretsyscall+",         KPROBE, 0, SEC_NONE, attach_ksyscall),
        SEC_DEF("usdt+",                KPROBE, 0, SEC_USDT, attach_usdt),
        SEC_DEF("usdt.s+",              KPROBE, 0, SEC_USDT | SEC_SLEEPABLE, attach_usdt),
        SEC_DEF("tc/ingress",           SCHED_CLS, BPF_TCX_INGRESS, SEC_NONE), /* alias for tcx */
        SEC_DEF("tc/egress",            SCHED_CLS, BPF_TCX_EGRESS, SEC_NONE),  /* alias for tcx */
        SEC_DEF("tcx/ingress",          SCHED_CLS, BPF_TCX_INGRESS, SEC_NONE),
        SEC_DEF("tcx/egress",           SCHED_CLS, BPF_TCX_EGRESS, SEC_NONE),
        SEC_DEF("tc",                   SCHED_CLS, 0, SEC_NONE), /* deprecated / legacy, use tcx */
        SEC_DEF("classifier",           SCHED_CLS, 0, SEC_NONE), /* deprecated / legacy, use tcx */
        SEC_DEF("action",               SCHED_ACT, 0, SEC_NONE), /* deprecated / legacy, use tcx */
        SEC_DEF("tracepoint+",          TRACEPOINT, 0, SEC_NONE, attach_tp),
        SEC_DEF("tp+",                  TRACEPOINT, 0, SEC_NONE, attach_tp),
        SEC_DEF("raw_tracepoint+",      RAW_TRACEPOINT, 0, SEC_NONE, attach_raw_tp),
        /* ...... remaining entries elided in this excerpt ...... */
        };
        
/*我们的示例程序中SEC()宏用的是SEC("tracepoint/skb/kfree_skb"),所以匹配的是section_defs数组里的
SEC_DEF("tracepoint+",TRACEPOINT,0,SEC_NONE, attach_tp)(末尾的'+'表示段名可以是"tracepoint"或"tracepoint/xxx"),
因此bpf_sec_def成员prog_attach_fn函数指针赋值为TRACEPOINT类型定义的attach_tp函数*/

/*
 * Expands to one struct bpf_sec_def initializer.
 *
 * The optional trailing argument (__VA_ARGS__) is a positional initializer
 * placed right after .prog_prepare_load_fn, so it initializes the next member,
 * prog_attach_fn (e.g. attach_tp).
 *
 * The line continuations below were missing from the original listing.
 */
#define SEC_DEF(sec_pfx, ptype, atype, flags, ...) {                        \
        .sec = (char *)sec_pfx,                                             \
        .prog_type = BPF_PROG_TYPE_##ptype,                                 \
        .expected_attach_type = atype,                                      \
        .cookie = (long)(flags),                                            \
        .prog_prepare_load_fn = libbpf_prepare_prog_load,                   \
        __VA_ARGS__                                                         \
}

/* One SEC() definition; the built-in instances are produced by SEC_DEF(). */
struct bpf_sec_def {
        char *sec;                                  /* section name prefix (SEC_DEF's sec_pfx) */
        enum bpf_prog_type prog_type;               /* BPF_PROG_TYPE_<ptype> */
        enum bpf_attach_type expected_attach_type;
        long cookie;                                /* SEC_DEF stores its flags argument here */
        int handler_id;

        libbpf_prog_setup_fn_t prog_setup_fn;
        libbpf_prog_prepare_load_fn_t prog_prepare_load_fn; /* SEC_DEF sets libbpf_prepare_prog_load */
        libbpf_prog_attach_fn_t prog_attach_fn;     /* auto-attach callback, e.g. attach_tp; called by bpf_object__attach_skeleton */
};



        

2. 加载bpf程序编译成的object文件到内核

代码语言:javascript复制
//trace_kfree_skb_bpf__load为bpftool gen skeleton trace_kfree_skb.bpf.o自动生成的函数
/* Loads the BPF object owned by the skeleton; libbpf's status is returned unchanged. */
static inline int
trace_kfree_skb_bpf__load(struct trace_kfree_skb_bpf *obj)
{
        int status = bpf_object__load_skeleton(obj->skeleton);

        return status;
}

//bpf_object__load_skeleton为libbpf提供的库函数
int bpf_object__load_skeleton(struct bpf_object_skeleton *s)
   -->bpf_object__load(*s->obj);//bpf_object__load(struct bpf_object *obj)加载第一步生成的obj结构体
      -->bpf_object_load(obj, 0, NULL);
         err = bpf_object__probe_loading(obj);//通过bpf系统调用syscall(__NR_bpf,BPF_PROG_LOAD, attr, size)加载一个最小的测试程序,探测内核是否支持加载bpf程序
         err = err ? : bpf_object__load_vmlinux_btf(obj, false);//读取内核vmlinux信息
         err = err ? : bpf_object__resolve_externs(obj, obj->kconfig);//读取内核kconfig/kallsysm/vmlinux btf信息
         err = err ? : bpf_object__sanitize_and_load_btf(obj);//syscall(__NR_bpf,BPF_BTF_LOAD, attr, size)加载btf信息到内核
         err = err ? : bpf_object__sanitize_maps(obj);//判断内核支持的map种类
         err = err ? : bpf_object__init_kern_struct_ops_maps(obj);//初始化bpf_map结构的相关字段
         err = err ? : bpf_object__create_maps(obj);//syscall(__NR_bpf,BPF_MAP_CREATE, attr, size)创建map
         err = err ? : bpf_object__relocate(obj, obj->btf_custom_path ? : target_btf_path);//处理bpf代码重定位信息
         err = err ? : bpf_object__load_progs(obj, extra_log_level);//syscall(__NR_bpf,BPF_PROG_LOAD, attr, size)加载经过重定位、btf修改的bpf代码
         err = err ? : bpf_object_init_prog_arrays(obj);
         err = err ? : bpf_object_prepare_struct_ops(obj);
 

3. 挂载ebpf程序的handler函数(例子里的tp_kfree_skb)指定的event tracepoint

代码语言:javascript复制
//trace_kfree_skb_bpf__attach为bpftool gen skeleton trace_kfree_skb.bpf.o自动生成的函数
/* Auto-attaches the skeleton's programs; libbpf's status is returned unchanged. */
static inline int
trace_kfree_skb_bpf__attach(struct trace_kfree_skb_bpf *obj)
{
        int status = bpf_object__attach_skeleton(obj->skeleton);

        return status;
}

//bpf_object__attach_skeleton为libbpf的库函数

/*
 * Auto-attaches every program described by the skeleton.
 *
 * A program is attached only if autoload and autoattach are both enabled, its
 * SEC() definition provides an attach callback, and the caller has not already
 * set its link. For SEC("tracepoint/...") the callback is attach_tp.
 *
 * Returns 0 on success, or the callback's error passed through libbpf_err().
 */
int bpf_object__attach_skeleton(struct bpf_object_skeleton *s)
{
        int i, err;

        for (i = 0; i < s->prog_cnt; i++) {
                /* The prog/link slots were wired up by
                 * trace_kfree_skb_bpf__create_skeleton() to
                 * &obj->progs.trace_kfree_skb and &obj->links.trace_kfree_skb.
                 * The example never attaches manually, so *link is expected
                 * to still be NULL here.
                 * NOTE(review): *prog must already be non-NULL because it is
                 * dereferenced just below; presumably it is filled in when
                 * the skeleton is opened - confirm.
                 */
                struct bpf_program *prog = *s->progs[i].prog;
                struct bpf_link **link = s->progs[i].link;

                if (!prog->autoload || !prog->autoattach)
                        continue;

                /* auto-attaching not supported for this program */
                if (!prog->sec_def || !prog->sec_def->prog_attach_fn)
                        continue;

                /* if user already set the link manually, don't attempt auto-attach */
                if (*link)
                        continue;

                /* prog_attach_fn comes from the matching section_defs entry;
                 * for this example that is attach_tp.
                 */
                err = prog->sec_def->prog_attach_fn(prog, prog->sec_def->cookie, link);
                if (err) {
                        pr_warn("prog '%s': failed to auto-attach: %d\n",
                                bpf_program__name(prog), err);
                        return libbpf_err(err);
                }

                /* It's possible that for some SEC() definitions auto-attach
                 * is supported in some cases (e.g., if definition completely
                 * specifies target information), but is not in other cases.
                 * SEC("uprobe") is one such case. If user specified target
                 * binary and function name, such BPF program can be
                 * auto-attached. But if not, it shouldn't trigger skeleton's
                 * attach to fail. It should just be skipped.
                 * attach_fn signals such case with returning 0 (no error) and
                 * setting link to NULL.
                 */
        }

        return 0;
}


/*attach_tp主要完成tracepoint注册以及使能并将ebpf程序handler函数挂载到指定的event tracepoint上
perf_event_open_tracepoint通过/sys/kernel/debug/tracing/events/skb/kfree_skb/id获取event id,
再通过系统调用__NR_perf_event_open注册和使能对应的tracepoint __tracepoint_##name也就是我们
示例用到的__tracepoint_kfree_skb,并返回对应的perf event 的文件描述符pfd.
ioctl(pfd, PERF_EVENT_IOC_SET_BPF, prog_fd)将ebpf程序与对应的tracepoint关联上,当tracepoint
被调用时ebpf程序handler函数也会被执行*/

static int attach_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link)
  ->bpf_program__attach_tracepoint(prog, tp_cat, tp_name);
     ->bpf_program__attach_tracepoint_opts(prog, tp_category, tp_name, NULL);
        ->pfd = perf_event_open_tracepoint(tp_category, tp_name);
                -->pfd = syscall(__NR_perf_event_open,...)//注册tracepoint
        ->link = bpf_program__attach_perf_event_opts(prog, pfd, &pe_opts);
                 /*kernel_supports(prog->obj, FEAT_PERF_LINK)检查到内核不支持BPF_LINK_CREATE
                 因此libbpf执行ioctl(pfd, PERF_EVENT_IOC_SET_BPF, prog_fd),这里实现将prog_fd对应
                 的ebpf程序跟pfd对应的perf event关联上,从而实现挂载ebpf程序的handler函数tp_kfree_skb
                 到指定要跟踪的tracepoint*/
                 -->ioctl(pfd, PERF_EVENT_IOC_SET_BPF, prog_fd)
                 -->ioctl(pfd, PERF_EVENT_IOC_ENABLE, 0)//启用文件描述符参数对应的perf event 计数


//strace跟踪trace_kfree_skb系统调用
#strace ./trace_kfree_skb
.....
faccessat(AT_FDCWD, "/sys/kernel/debug/tracing", F_OK) = 0
openat(AT_FDCWD, "/sys/kernel/debug/tracing/events/skb/kfree_skb/id", O_RDONLY|O_CLOEXEC) = 6
fstat(6, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
read(6, "1409\n", 4096)                 = 5
read(6, "", 4096)                       = 0
close(6)     
perf_event_open({type=PERF_TYPE_TRACEPOINT, size=0x88 /* PERF_ATTR_SIZE_??? */, config=1409, sample_period=0, sample_type=0, read_format=0, precise_ip=0 /* arbitrary skid */, ...}, -1, 0, -1, PERF_FLAG_FD_CLOEXEC) = 6
bpf(BPF_PROG_LOAD, {prog_type=BPF_PROG_TYPE_TRACEPOINT, insn_cnt=2, insns=0x7fff34970d80, license="GPL", log_level=0, log_size=0, log_buf=NULL, kern_version=KERNEL_VERSION(0, 0, 0), prog_flags=0, prog_name="", prog_ifindex=0, expected_attach_type=BPF_CGROUP_INET_INGRESS, prog_btf_fd=0, func_info_rec_size=0, func_info=NULL, func_info_cnt=0, line_info_rec_size=0, line_info=NULL, line_info_cnt=0, attach_btf_id=0, attach_prog_fd=0, fd_array=NULL}, 144) = 7
bpf(BPF_LINK_CREATE, {link_create={prog_fd=7, target_fd=-1, attach_type=BPF_PERF_EVENT, flags=0, perf_event={bpf_cookie=0}}}, 64) = -1 EINVAL (Invalid argument)
close(7)                                = 0
ioctl(6, PERF_EVENT_IOC_SET_BPF, 5)     = 0
ioctl(6, PERF_EVENT_IOC_ENABLE, 0)      = 0

/*tracepoint类型的eBPF程序与kprobe类似,都是基于perf来实现的,
libbpf通过perf_event_open在实现给perf注册tracepoint的时候的调用链为:*/
perf_event_open
  ->perf_event_alloc
      ->perf_init_event
          ->perf_try_init_event
              ->(pmu->event_init(event))//pmu->event_init对应perf_tp_event_init
                 ->perf_trace_init
                     ->perf_trace_event_init 
                         ->perf_trace_event_reg
                             ->tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL);//trace_event_reg
                                 ->tracepoint_probe_register     
                                 
/*系统启动初始化时通过调用链perf_event_init->perf_tp_register->perf_pmu_register将
pmu->event_init初始化为perf_tp_event_init*/
/* PMU descriptor for tracepoint events; perf_tp_register() registers it as
 * PERF_TYPE_TRACEPOINT. */
static struct pmu perf_tracepoint = {
        .task_ctx_nr    = perf_sw_context,

        .event_init     = perf_tp_event_init,   /* invoked as pmu->event_init(event) during perf_event_open */
        .add            = perf_trace_add,
        .del            = perf_trace_del,
        .start          = perf_swevent_start,
        .stop           = perf_swevent_stop,
        .read           = perf_swevent_read,
};

/* Registers the perf_tracepoint PMU under the name "tracepoint" with type
 * PERF_TYPE_TRACEPOINT. */
static inline void perf_tp_register(void)
{
        perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
}
//kernel/events/core.c
/* Boot-time perf initialisation (excerpt): among other steps it calls
 * perf_tp_register(), which registers the tracepoint PMU. */
void __init perf_event_init(void)
{
   ......
   perf_tp_register();
   ......
}

/*p_event->tp_event = tp_event; 实现将一个tracepoint所对应的trace_event_call与perf event相关联,
这样完成perf_event 与trace_event_call的关联,ioctl(pfd, PERF_EVENT_IOC_SET_BPF, prog_fd)将
ebpf程序与指定的tracepoint绑定时会通过perf_event找到对应的trace_event_call*/
static int perf_trace_event_reg(struct trace_event_call *tp_event,
                                struct perf_event *p_event)
{
        struct hlist_head __percpu *list;
        int ret = -ENOMEM;
        int cpu;

        p_event->tp_event = tp_event;
        ......
        ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL);
        if (ret)
                goto fail;

        total_ref_count  ;
        .......
        
}      

/*讲到内核部分绕不开TRACE_EVENT的实现机制,内核通过TRACE_EVENT来定义一个tracepoint,然后
在内核需要调用的地方使用函数trace_xxx来打印输出相关信息,比如kfree_skb调用trace_kfree_skb函数
trace_kfree_skb实现通过TRACE_EVENT(kfree_skb,...)在编译阶段完成定义.
编译后trace_##name会被替换为trace_kfree_skb*/

//include/trace/events/skb.h
/* Defines the kfree_skb tracepoint: prototype, recorded entry fields, how the
 * fields are filled in, and the print format. */
TRACE_EVENT(kfree_skb,

        TP_PROTO(struct sk_buff *skb, void *location),

        TP_ARGS(skb, location),

        /* Per-event fields that follow the common trace_entry header. */
        TP_STRUCT__entry(
                __field(        void *,         skbaddr         )
                __field(        void *,         location        )
                __field(        unsigned short, protocol        )
        ),

        TP_fast_assign(
                __entry->skbaddr = skb;
                __entry->location = location;
                __entry->protocol = ntohs(skb->protocol);
        ),

        TP_printk("skbaddr=%p protocol=%u location=%p",
                __entry->skbaddr, __entry->protocol, __entry->location)
);

//include/linux/tracepoint.h
/*
 * Excerpt: declares the tracepoint object __tracepoint_<name> and the inline
 * caller trace_<name>(), which runs __DO_TRACE only when the tracepoint's
 * static key is enabled.
 *
 * The line continuations below were missing from the original listing.
 */
#define __DECLARE_TRACE(name, proto, args, cond, data_proto, data_args) \
        extern struct tracepoint __tracepoint_##name;                   \
        static inline void trace_##name(proto)                          \
        {                                                               \
                if (static_key_false(&__tracepoint_##name.key))         \
                        __DO_TRACE(&__tracepoint_##name,                \
                                TP_PROTO(data_proto),                   \
                                TP_ARGS(data_args),                     \
                                TP_CONDITION(cond), 0);                 \
                if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) {             \
                        rcu_read_lock_sched_notrace();                  \
                        rcu_dereference_sched(__tracepoint_##name.funcs);\
                        rcu_read_unlock_sched_notrace();                \
                }                                                       \
        }



/*include/trace/events/skb.h 中一共包含了两个头文件:include/linux/tracepoint.h和
include/trace/define_trace.h,在include/trace/define_trace.h中,
通过#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)语句再次包含include/trace/events/skb.h,
另外通过#include <trace/trace_events.h>和#include <trace/perf.h>包含include/trace/trace_events.h
和include/trace/perf.h,再结合#undef TRACE_EVENT和#define TRACE_EVENT的方式实现了一个
宏定义多次展开的效果*/

/*通过include/linux/tracepoint.h、include/trace/define_trace.h、include/trace/trace_events.h
 和include/trace/perf.h头文件对TRACE_EVENT宏的多次展开后,
 tp_event->class->reg对应trace_event_reg函数*/

//include/trace/trace_events.h
/* Fragment of the DECLARE_EVENT_CLASS macro body (line continuations restored):
 * the per-class descriptor whose .reg callback is trace_event_reg. */
static struct trace_event_class __used __refdata event_class_##call = { \
        .system                 = TRACE_SYSTEM_STRING,                  \
        .define_fields          = trace_event_define_fields_##call,     \
        .fields                 = LIST_HEAD_INIT(event_class_##call.fields),\
        .raw_init               = trace_event_raw_init,                 \
        .probe                  = trace_event_raw_event_##call,         \
        .reg                    = trace_event_reg,                      \
        _TRACE_PERF_INIT(call)                                          \
};

//kernel/trace/trace_events.c
/*
 * Registration callback installed as the .reg member of each event class
 * (excerpt; elided cases are marked with dots). For the perf cases shown it
 * registers or unregisters call->class->perf_probe on the tracepoint call->tp.
 */
int trace_event_reg(struct trace_event_call *call,
                    enum trace_reg type, void *data)
{
        struct trace_event_file *file = data;

        WARN_ON(!(call->flags & TRACE_EVENT_FL_TRACEPOINT));
        switch (type) {
        .......
#ifdef CONFIG_PERF_EVENTS
        case TRACE_REG_PERF_REGISTER:
                /* Hook the perf probe (perf_trace_##call, set by _TRACE_PERF_INIT) onto the tracepoint. */
                return tracepoint_probe_register(call->tp,
                                                 call->class->perf_probe,
                                                 call);
        case TRACE_REG_PERF_UNREGISTER:
                tracepoint_probe_unregister(call->tp,
                                            call->class->perf_probe,
                                            call);
                return 0;
        .......
                return 0;
#endif
        }
        return 0;
}   


/*tracepoint_probe_register(call->tp,call->class->perf_probe,call);参数
call->tp是在perf_trace_init函数通过遍历ftrace_events获取,而从DEFINE_EVENT的定义
可以知道trace_event_call是被放在名为_ftrace_events的section中*/

/*
 * perf_trace_init - bind a perf event to the matching trace_event_call.
 * @p_event: perf event whose attr.config carries the wanted event id.
 *
 * Walks the ftrace_events list under event_mutex and stops at the first
 * entry whose event.type equals the id, that has a class with a reg
 * callback, and whose module reference can be taken.
 *
 * Return: -EINVAL when no entry qualifies, otherwise the result of
 * perf_trace_event_init() for the entry that was found.
 */
int perf_trace_init(struct perf_event *p_event)
{
        struct trace_event_call *tp_event;
        u64 event_id = p_event->attr.config;
        int ret = -EINVAL;

        mutex_lock(&event_mutex);
        list_for_each_entry(tp_event, &ftrace_events, list) {
                /*
                 * try_module_get() is evaluated last, so a module reference
                 * is only taken for an entry that already matched.
                 */
                if (tp_event->event.type == event_id &&
                    tp_event->class && tp_event->class->reg &&
                    try_module_get(tp_event->mod)) {
                        ret = perf_trace_event_init(tp_event, p_event);
                        /* Drop the module reference again if init failed. */
                        if (ret)
                                module_put(tp_event->mod);
                        break;
                }
        }
        mutex_unlock(&event_mutex);

        return ret;
}


        
//include/trace/trace_events.h
/*
 * Initializer fragment spliced into a trace_event_class definition: it sets
 * .perf_probe to the perf_trace_<call> function generated for that class.
 * The trailing backslash is required; without it the #define would be empty
 * and the initializer line would be stray text at file scope.
 */
#define _TRACE_PERF_INIT(call)                                          \
        .perf_probe             = perf_trace_##call,
        
        
//include/trace/trace_events.h

/*
 * trace_events.h stage of DECLARE_EVENT_CLASS: emits, per event class,
 * the perf probe prototype, the print format string and the
 * trace_event_class object. Its .reg member is trace_event_reg and its
 * .perf_probe member comes from _TRACE_PERF_INIT(call).
 *
 * Every line of the macro body except the last must end in a backslash so
 * that the whole definition belongs to the #define.
 */
#undef DECLARE_EVENT_CLASS
#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)  \
_TRACE_PERF_PROTO(call, PARAMS(proto));                                 \
static char print_fmt_##call[] = print;                                 \
static struct trace_event_class __used __refdata event_class_##call = { \
        .system                 = TRACE_SYSTEM_STRING,                  \
        .define_fields          = trace_event_define_fields_##call,     \
        .fields                 = LIST_HEAD_INIT(event_class_##call.fields),\
        .raw_init               = trace_event_raw_init,                 \
        .probe                  = trace_event_raw_event_##call,         \
        .reg                    = trace_event_reg,                      \
        _TRACE_PERF_INIT(call)                                          \
};

/*
 * trace_events.h stage of DEFINE_EVENT: emits the trace_event_call object
 * for one event (class = the template's event class, tp = the event's
 * tracepoint) plus a pointer to it placed in the "_ftrace_events" section.
 *
 * The macro deliberately ends without a semicolon; the user of
 * DEFINE_EVENT supplies it. Every line but the last carries a
 * line-continuation backslash.
 */
#undef DEFINE_EVENT
#define DEFINE_EVENT(template, call, proto, args)                       \
                                                                        \
static struct trace_event_call __used event_##call = {                  \
        .class                  = &event_class_##template,              \
        {                                                               \
                .tp                     = &__tracepoint_##call,         \
        },                                                              \
        .event.funcs            = &trace_event_type_funcs_##template,   \
        .print_fmt              = print_fmt_##template,                 \
        .flags                  = TRACE_EVENT_FL_TRACEPOINT,            \
};                                                                      \
static struct trace_event_call __used                                   \
__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call


//_ftrace_events这个section被包含在__start_ftrace_events和__stop_ftrace_events之间

//include/asm-generic/vmlinux.lds.h
/*
 * Linker-script fragment: keeps every input section named _ftrace_events
 * between the symbols __start_ftrace_events and __stop_ftrace_events, and
 * every _ftrace_eval_map section between __start_ftrace_eval_maps and
 * __stop_ftrace_eval_maps. Each line except the last needs a
 * line-continuation backslash to stay part of the macro.
 */
#define FTRACE_EVENTS() . = ALIGN(8);                                   \
                        VMLINUX_SYMBOL(__start_ftrace_events) = .;      \
                        KEEP(*(_ftrace_events))                         \
                        VMLINUX_SYMBOL(__stop_ftrace_events) = .;       \
                        VMLINUX_SYMBOL(__start_ftrace_eval_maps) = .;   \
                        KEEP(*(_ftrace_eval_map))                       \
                        VMLINUX_SYMBOL(__stop_ftrace_eval_maps) = .;

/*系统启动初始化时会调用event_trace_enable,将__start_ftrace_events和__stop_ftrace_events之间
的_ftrace_events段中的trace_event_call加到LIST_HEAD(ftrace_events)上,从而使perf_trace_init能够通过
遍历&ftrace_events来获取所有的trace_event_call,并通过判断其与libbpf传入的event_id是否匹配来
找到我们ebpf程序要跟踪的perf event*/
/*
 * event_trace_enable - __init routine that populates the ftrace_events list
 * (excerpt; the tail of the function is elided).
 *
 * Iterates over the trace_event_call pointers stored between
 * __start_ftrace_events and __stop_ftrace_events, runs event_init() on each
 * one and links those for which it returned 0 onto ftrace_events.
 *
 * Return: -ENODEV when top_trace_array() yields NULL; the remaining return
 * paths are not shown in this excerpt.
 */
static __init int event_trace_enable(void)
{
        struct trace_array *tr = top_trace_array();
        struct trace_event_call **iter, *call;
        int ret;

        if (!tr)        
                return -ENODEV;
        
        for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) {

                /* The section holds pointers, hence the extra dereference. */
                call = *iter;
                ret = event_init(call);
                if (!ret)
                        list_add(&call->list, &ftrace_events);
        }
        ......
        ......
 }       



/*在我们的示例中perf_trace_##call宏展开后为perf_trace_kfree_skb*/ 
//include/trace/perf.h
/*
 * perf.h stage of DECLARE_EVENT_CLASS: generates perf_trace_<call>(), the
 * probe that is attached to the tracepoint for perf/BPF consumers.
 *
 * The generated function:
 *   - receives the trace_event_call as its first (void *) argument;
 *   - returns early when there is no BPF program, no task and the per-CPU
 *     perf_events list is empty;
 *   - sizes the record as data + header + a u32, rounded up to a multiple
 *     of sizeof(u64), then subtracts the u32 again;
 *   - allocates the buffer, captures the caller registers, fills the entry
 *     via the class's assign block, and hands it to
 *     perf_trace_run_bpf_submit().
 *
 * Two extraction defects are repaired here: the `+` operators inside the
 * ALIGN() size expression and the line-continuation backslashes.
 */
#undef DECLARE_EVENT_CLASS
#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)  \
static notrace void                                                     \
perf_trace_##call(void *__data, proto)                                  \
{                                                                       \
        struct trace_event_call *event_call = __data;                   \
        struct trace_event_data_offsets_##call __maybe_unused __data_offsets;\
        struct trace_event_raw_##call *entry;                           \
        struct bpf_prog *prog = event_call->prog;                       \
        struct pt_regs *__regs;                                         \
        u64 __count = 1;                                                \
        struct task_struct *__task = NULL;                              \
        struct hlist_head *head;                                        \
        int __entry_size;                                               \
        int __data_size;                                                \
        int rctx;                                                       \
                                                                        \
        __data_size = trace_event_get_offsets_##call(&__data_offsets, args); \
                                                                        \
        head = this_cpu_ptr(event_call->perf_events);                   \
        if (!prog && __builtin_constant_p(!__task) && !__task &&        \
                                hlist_empty(head))                      \
                return;                                                 \
                                                                        \
        __entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\
                             sizeof(u64));                              \
        __entry_size -= sizeof(u32);                                    \
                                                                        \
        entry = perf_trace_buf_alloc(__entry_size, &__regs, &rctx);     \
        if (!entry)                                                     \
                return;                                                 \
                                                                        \
        perf_fetch_caller_regs(__regs);                                 \
                                                                        \
        tstruct                                                         \
                                                                        \
        { assign; }                                                     \
                                                                        \
        perf_trace_run_bpf_submit(entry, __entry_size, rctx,            \
                                  event_call, __count, __regs,          \
                                  head, __task);                        \
}
        
                        

/*从上面的分析可以知道trace_event_reg调用执行
tracepoint_probe_register(call->tp,call->class->perf_probe,call)时第一个
参数struct tracepoint *tp对应的是call->tp,其对应的
是struct trace_event_call __used event_kfree_skb.tp也就是&__tracepoint_kfree_skb。
第二个参数call->class->perf_probe,call->class对应的是
struct trace_event_call __used event_kfree_skb.class也就是&event_class_##template,
&event_class_##template实现在static struct trace_event_class __used __refdata event_class_##call
处定义,event_class_##call成员对应的宏_TRACE_PERF_INIT为.perf_probe = perf_trace_##call,
因此call->class->perf_probe对应的是perf_trace_##call展开后的函数perf_trace_kfree_skb*/

                                                                                                                                                                                                                                                                                                    
/*
 * tracepoint_probe_register - attach a probe at the default priority.
 * @tp:    tracepoint to attach to.
 * @probe: probe function.
 * @data:  private pointer stored alongside the probe.
 *
 * Thin wrapper: forwards all arguments to tracepoint_probe_register_prio()
 * with TRACEPOINT_DEFAULT_PRIO and returns its result.
 */
int tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data)
{
        return tracepoint_probe_register_prio(tp, probe, data, TRACEPOINT_DEFAULT_PRIO);
}

/*tp_func初始化后通过tracepoint_add_func加入到tracepoint的funcs的尾部。
__DO_TRACE会遍历tracepoint指针成员funcs指向的所有tracepoint_func,从而获得要调用执行的函数,
在我们的例子中对应perf_trace_kfree_skb以及perf_trace_kfree_skb的第一个参数event_kfree_skb.*/
/*
 * tracepoint_probe_register_prio - attach a probe with an explicit priority.
 * @tp:    tracepoint to attach to.
 * @probe: probe function.
 * @data:  private pointer stored alongside the probe.
 * @prio:  priority recorded in the tracepoint_func and passed on.
 *
 * Packs the three values into a stack-local tracepoint_func and hands it to
 * tracepoint_add_func() while holding tracepoints_mutex.
 *
 * Return: the result of tracepoint_add_func().
 */
int tracepoint_probe_register_prio(struct tracepoint *tp, void *probe,
                                   void *data, int prio)
{
        struct tracepoint_func tp_func;
        int ret;

        mutex_lock(&tracepoints_mutex);
        tp_func.func = probe; /* in the article's example: perf_trace_kfree_skb */
        tp_func.data = data;  /* in the article's example: struct trace_event_call event_kfree_skb */
        tp_func.prio = prio;
        ret = tracepoint_add_func(tp, &tp_func, prio);
        mutex_unlock(&tracepoints_mutex);
        return ret;
}


/*tracepoint_add_func 这里会将perf_trace_##call展开的perf_trace_kfree_skb加入到
struct tracepoint __tracepoint_##name的成员funcs,也就
是struct tracepoint __tracepoint_kfree_skb.funcs的尾部,另外tracepoint_add_func
调用static_key_slow_inc将__tracepoint_kfree_skb.key加1,使能该event tracepoint*/

/*
 * Per-tracepoint descriptor. The article's other excerpts show how three of
 * these fields are used: tracepoint_add_func() calls regfunc and enables
 * key, and publishes the probe array through funcs; trace_<name>() tests
 * key before invoking __DO_TRACE.
 */
struct tracepoint {
        const char *name;               /* Tracepoint name */
        struct static_key key;          /* enabled once a probe has been added */
        int (*regfunc)(void);           /* optional hook, called while key is not yet enabled */
        void (*unregfunc)(void);        /* presumably the counterpart of regfunc - not shown here */
        struct tracepoint_func __rcu *funcs; /* RCU-published array of probe callbacks */
};

/*
 * tracepoint_add_func - add the probe function to a tracepoint.
 * @tp:   tracepoint to modify.
 * @func: probe description (function, data, priority) to add.
 * @prio: priority passed through to func_add().
 *
 * Expects tracepoints_mutex to be held (see the lockdep_is_held() check).
 *
 * Return: 0 on success; a negative value from tp->regfunc(), or the error
 * encoded in func_add()'s result, on failure.
 */
static int tracepoint_add_func(struct tracepoint *tp,
                               struct tracepoint_func *func, int prio)
{
        struct tracepoint_func *old, *tp_funcs;
        int ret;

        /* The optional regfunc hook runs only while the key is still disabled. */
        if (tp->regfunc && !static_key_enabled(&tp->key)) {
                ret = tp->regfunc();
                if (ret < 0)
                        return ret;
        }

        tp_funcs = rcu_dereference_protected(tp->funcs,
                        lockdep_is_held(&tracepoints_mutex));
        /*
         * func_add() receives tp_funcs by address and may replace it; its
         * return value is either an error pointer or something that is
         * handed to release_probes() once the new array is published.
         */
        old = func_add(&tp_funcs, func, prio);
        if (IS_ERR(old)) {
                WARN_ON_ONCE(PTR_ERR(old) != -ENOMEM);
                return PTR_ERR(old);
        }

        /*
         * rcu_assign_pointer has a smp_wmb() which makes sure that the new
         * probe callbacks array is consistent before setting a pointer to it.
         * This array is referenced by __DO_TRACE from
         * include/linux/tracepoints.h. A matching smp_read_barrier_depends()
         * is used.
         */
        rcu_assign_pointer(tp->funcs, tp_funcs);
        /* Enable the static key so the trace_<name>() fast-path check fires. */
        if (!static_key_enabled(&tp->key))
                static_key_slow_inc(&tp->key);
        release_probes(old);
        return 0;
}



前面介绍libbpf是通过调用ioctl(pfd, PERF_EVENT_IOC_SET_BPF, prog_fd)来将prog_fd对应
的ebpf程序跟pfd对应的perf event关联上,进而关联到指定要跟踪的tracepoint上,这里的实现逻辑如下:

/*p_event->tp_event = tp_event 将tracepoint所对应的trace_event_call与perf event相关联,
这样就完成了perf_event与trace_event_call的关联
*/
/*
 * perf_trace_event_reg (excerpt) - only the assignment relevant to the
 * article is shown; the rest of the body is elided.
 * @tp_event: trace_event_call that the perf event is being bound to.
 * @p_event:  perf event receiving the binding.
 */
static int perf_trace_event_reg(struct trace_event_call *tp_event,
                                struct perf_event *p_event)
{
        ......
        /*
         * Record the binding; perf_event_attach_bpf_prog() later reaches
         * the program array through event->tp_event.
         */
        p_event->tp_event = tp_event;
        ......
}      

ioctl(pfd, PERF_EVENT_IOC_SET_BPF, prog_fd)调用链路:

ioctl(pfd, PERF_EVENT_IOC_SET_BPF, prog_fd)
  -->perf_event_set_bpf_prog(event, arg);
       -->perf_event_attach_bpf_prog(event, prog);
         
         
 /*perf_event_attach_bpf_prog下面部分代码实现新添加的ebpf程序的函数加入到
event->tp_event->prog_array尾部上*/
 
 /*
  * perf_event_attach_bpf_prog (excerpt) - add @prog to the program array of
  * the trace_event_call bound to @event. Locking, setup and the unlock label
  * are elided.
  *
  * Shown behaviour: fails with -E2BIG when the current array already holds
  * BPF_TRACE_MAX_PROGS programs, or with bpf_prog_array_copy()'s negative
  * result; otherwise stores prog in event->prog and publishes the new array.
  */
 int perf_event_attach_bpf_prog(struct perf_event *event,
                               struct bpf_prog *prog)
{       
     
        .......               
        old_array = bpf_event_rcu_dereference(event->tp_event->prog_array);
        /* Refuse to grow beyond the per-event program limit. */
        if (old_array &&
            bpf_prog_array_length(old_array) >= BPF_TRACE_MAX_PROGS) {
                ret = -E2BIG;
                goto unlock;
        }
 
        /* Build new_array from old_array and prog (NULL: no program is excluded here). */
        ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array);
        if (ret < 0)
                goto unlock;

        /* set the new array to event->tp_event and set event->prog */
        event->prog = prog;
        rcu_assign_pointer(event->tp_event->prog_array, new_array);
        ......          
}       

    

4. 内核执行添加的ebpf程序调用链:

代码语言:javascript复制
基于前面的分析可以知道最终当内核调用trace_kfree_skb时,trace_kfree_skb会调用__DO_TRACE
遍历并执行__tracepoint_kfree_skb.funcs中注册的函数,也就是perf_trace_kfree_skb会被调用,
perf_trace_kfree_skb最终通过调用perf_trace_run_bpf_submit,进而调用trace_call_bpf
遍历执行关联到对应tracepoint上的所有ebpf程序,这里通过遍历trace_event_call.prog_array
来实现,从而示例中的tp_kfree_skb也就被调用,相关调用链如下:

trace_kfree_skb //trace_##name展开而来
  -->__DO_TRACE
      -->perf_trace_kfree_skb//perf_trace_##call展开而来
           -->perf_trace_run_bpf_submit
               -->trace_call_bpf
                //BPF_PROG_RUN_ARRAY_CHECK遍历trace_event_call.prog_array所有挂载到该tracepoint的bpf程序
                    -->BPF_PROG_RUN_ARRAY_CHECK
                         -->tp_kfree_skb//执行前面例子ebpf程序handler函数
                         
//include/linux/tracepoint.h
/*
 * Excerpt of __DECLARE_TRACE: declares the tracepoint object and generates
 * the inline trace_<name>() call site. The generated function invokes
 * __DO_TRACE only when the tracepoint's static key is enabled; the second
 * `if` merely performs an RCU dereference of the probe array when
 * CONFIG_LOCKDEP is enabled and the condition holds.
 *
 * Line-continuation backslashes are restored so that the body belongs to
 * the #define.
 */
#define __DECLARE_TRACE(name, proto, args, cond, data_proto, data_args) \
        extern struct tracepoint __tracepoint_##name;                   \
        static inline void trace_##name(proto)                          \
        {                                                               \
                if (static_key_false(&__tracepoint_##name.key))         \
                        __DO_TRACE(&__tracepoint_##name,                \
                                TP_PROTO(data_proto),                   \
                                TP_ARGS(data_args),                     \
                                TP_CONDITION(cond), 0);                 \
                if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) {             \
                        rcu_read_lock_sched_notrace();                  \
                        rcu_dereference_sched(__tracepoint_##name.funcs);\
                        rcu_read_unlock_sched_notrace();                \
                }                                                       \
        }

//kernel/events/core.c
/*
 * perf_trace_run_bpf_submit - run attached BPF programs, then submit to perf.
 * @raw_data: trace record buffer; its first pointer-sized slot is
 *            overwritten with @regs before the programs run.
 * @size:     record size passed to perf_tp_event().
 * @rctx:     recursion context released on the early-return path.
 * @call:     trace_event_call whose program array is consulted.
 * @count, @regs, @head, @task: forwarded to perf_tp_event().
 *
 * When the event has a valid program array and either trace_call_bpf()
 * returns 0 or @head is empty, the record is dropped: the recursion
 * context is released and perf_tp_event() is not called.
 */
void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
                               struct trace_event_call *call, u64 count,
                               struct pt_regs *regs, struct hlist_head *head,
                               struct task_struct *task)
{
        if (bpf_prog_array_valid(call)) {
                /* Place the pt_regs pointer at the start of the record before the programs run. */
                *(struct pt_regs **)raw_data = regs;
                if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) {
                        perf_swevent_put_recursion_context(rctx);
                        return;
                }
        }
        perf_tp_event(call->event.type, count, raw_data, size, regs, head,
                      rctx, task);
}


//include/linux/bpf.h
/*
 * __BPF_PROG_RUN_ARRAY - run every program of a NULL-terminated program
 * array and AND their results together.
 * @array:          RCU-protected pointer to the bpf_prog_array.
 * @ctx:            context pointer handed to each program.
 * @func:           callable invoked as func(prog, ctx) for each program.
 * @check_non_null: when true, a NULL array is tolerated and skipped.
 * @set_cg_storage: when true, bpf_cgroup_storage_set() is called with the
 *                  item's cgroup_storage before each program runs.
 *
 * Evaluates to a u32 that starts at 1 and is AND-ed with each program's
 * return value, so one program returning 0 makes the whole result 0. The
 * walk runs with preemption disabled inside an RCU read-side section.
 *
 * The macro contains a label, so it can be expanded at most once per
 * function. Two extraction defects are repaired here: `_item++` (the loop
 * could not advance otherwise) and the line-continuation backslashes.
 */
#define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null, set_cg_storage) \
        ({                                              \
                struct bpf_prog_array_item *_item;      \
                struct bpf_prog *_prog;                 \
                struct bpf_prog_array *_array;          \
                u32 _ret = 1;                           \
                preempt_disable();                      \
                rcu_read_lock();                        \
                _array = rcu_dereference(array);        \
                if (unlikely(check_non_null && !_array))\
                        goto _out;                      \
                _item = &_array->items[0];              \
                while ((_prog = READ_ONCE(_item->prog))) {              \
                        if (set_cg_storage)             \
                                bpf_cgroup_storage_set(_item->cgroup_storage);  \
                        _ret &= func(_prog, ctx);       \
                        _item++;                        \
                }                                       \
_out:                                                   \
                rcu_read_unlock();                      \
                preempt_enable();                       \
                _ret;                                   \
         })

/*
 * Variant of __BPF_PROG_RUN_ARRAY that tolerates a NULL array
 * (check_non_null = true) and does not set cgroup storage
 * (set_cg_storage = false). The backslash keeps the expansion on the
 * second line part of the #define.
 */
#define BPF_PROG_RUN_ARRAY_CHECK(array, ctx, func)      \
        __BPF_PROG_RUN_ARRAY(array, ctx, func, true, false)


//kernel/trace/bpf_trace.c

/**
 * trace_call_bpf - invoke BPF program
 * @call: tracepoint event
 * @ctx: opaque context pointer
 *
 * kprobe handlers execute BPF programs via this helper.
 * Can be used from static tracepoints in the future.
 *
 * Return: BPF programs always return an integer which is interpreted by
 * kprobe handler as:
 * 0 - return from kprobe (event is filtered out)
 * 1 - store kprobe event into ring buffer
 * Other values are reserved and currently alias to 1
 *
 * Only the program-array invocation is shown in this excerpt; the
 * surrounding statements are elided.
 */
unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
{
   ......
   /*
    * Run every program in call->prog_array on ctx through BPF_PROG_RUN;
    * the macro's combined result is stored in ret.
    */
   ret = BPF_PROG_RUN_ARRAY_CHECK(call->prog_array, ctx, BPF_PROG_RUN);
   ......
}

trace_call_bpf(call, raw_data)的返回值取决于我们定义的ebpf内核态程序的函数返回值,对应到前面的示例
就是tp_kfree_skb函数的返回值。当tp_kfree_skb返回值为0时perf_tp_event将不会被执行。因此如果tp_kfree_skb
返回值为0时,当我们同时执行我们的ebpf程序trace_kfree_skb以及perf trace -e skb:kfree_skb时,
这时候perf trace -e skb:kfree_skb将无法获取到消息输出。 

阅读代码过程参考了如下资料:

https://github.com/libbpf/libbpf-bootstrap

https://blog.csdn.net/qq_17045267/article/details/125642103

https://terenceli.github.io/技术/2020/08/09/ebpf-with-tracepoint

https://richardweiyang-2.gitbook.io/kernel-exploring/00-index-3/02-trace_event

https://www.ebpf.top/categories/BPF-CORE/

0 人点赞