本文基于libbpf实现的ebpf例子介绍tracepoint类型ebpf程序调用流程,内核实现以5.4版本为例进行介绍。
一. 基于libbpf实现一个跟踪kfree_skb的tracepoint类型ebpf示例:
libbpf/bpftool项目地址:https://github.com/libbpf/libbpf,libbpf提供了一些加载bpf程序的方法,封装了内核提供的bpf()系统调用,帮助我们省去了大部分关于bpf本身的逻辑。
1. 环境准备
内核支持CONFIG_DEBUG_INFO_BTF且安装相关的工具包:
zgrep "CONFIG_DEBUG_INFO_BTF=y" /proc/config.gz # 检查内核支持CONFIG_DEBUG_INFO_BTF
yum install bpftool libbpf-devel llvm-libs llvm-devel llvm -y
下载编译libbpf:
代码语言:javascript复制
git clone https://github.com/libbpf/libbpf.git
cd libbpf/src
make
编译后生成libbpf.so libbpf.so.1 libbpf.so.1.3.0 libbpf.a库文件:
# pwd
/root/ebpf-example/libbpf/src
# ls
bpf.c bpf_helper_defs.h btf_dump.c hashmap.h libbpf.h libbpf.pc.template libbpf_version.h nlattr.h skel_internal.h strset.h
bpf_core_read.h bpf_helpers.h btf.h libbpf.a libbpf_internal.h libbpf_probes.c linker.c relo_core.c staticobjs usdt.bpf.h
bpf_endian.h bpf_prog_linfo.c elf.c libbpf.c libbpf_legacy.h libbpf.so Makefile relo_core.h str_error.c usdt.c
bpf_gen_internal.h bpf_tracing.h gen_loader.c libbpf_common.h libbpf.map libbpf.so.1 netlink.c ringbuf.c str_error.h zip.c
bpf.h btf.c hashmap.c libbpf_errno.c libbpf.pc libbpf.so.1.3.0 nlattr.c sharedobjs strset.c zip.h
后面程序使用的是<bpf/libbpf.h>,需要在/root/ebpf-example/libbpf/src创建bpf子目录,并将头文件拷贝
到bpf目录下,不然会直接使用系统自带的libbpf头文件。当然也可以存放到其他路径下,编译时通过-I、-L参数分别指定头文件和库文件的路径。
#mkdir bpf
# cp *.h bpf
# ls
bpf_core_read.h bpf_gen_internal.h bpf_helper_defs.h bpf_tracing.h hashmap.h libbpf.h libbpf_legacy.h nlattr.h skel_internal.h strset.h zip.h
bpf_endian.h bpf.h bpf_helpers.h btf.h libbpf_common.h libbpf_internal.h libbpf_version.h relo_core.h str_error.h usdt.bpf.h
编译libbpf时也可以通过环境变量指定编译后库文件和头文件的安装路径:
#cd libbpf
#make -C src/ clean
#BUILD_STATIC_ONLY=y OBJDIR=./ INCLUDEDIR=./ make -C src/ install
编译安装后libbpf.a在libbpf/src目录下:
#cd libbpf
# find | grep libbpf.a
./src/libbpf.a
头文件在libbpf/src和libbpf/src/bpf下
# find | grep -w bpf.h
./include/uapi/linux/bpf.h
./src/bpf/usdt.bpf.h
./src/bpf/bpf.h
./src/usdt.bpf.h
./src/bpf.h
2. 构建并编译ebpf程序在内核态执行的代码:
//bpftool读取 vmlinux 文件并生成对应的 vmlinux.h 头文件。
bpftool btf dump file /sys/kernel/btf/vmlinux format c > vmlinux.h
/*以kfree_skb为例,内核使用TRACE_EVENT定义了名为kfree_skb的tracepoint,
在eBPF程序里我们需要获取tracepoint的参数的话,需要按照同样的结构体格式
来访问entry里的数据。可以通过tracepoint的format来查看entry的数据结构.*/
# cat /sys/kernel/debug/tracing/events/skb/kfree_skb/format
name: kfree_skb
ID: 1409
format:
field:unsigned short common_type; offset:0; size:2; signed:0;
field:unsigned char common_flags; offset:2; size:1; signed:0;
field:unsigned char common_preempt_count; offset:3; size:1; signed:0;
field:int common_pid; offset:4; size:4; signed:1;
field:void * skbaddr; offset:8; size:8; signed:0;
field:void * location; offset:16; size:8; signed:0;
field:unsigned short protocol; offset:24; size:2; signed:0;
print fmt: "skbaddr=%p protocol=%u location=%p", REC->skbaddr, REC->protocol, REC->location
/*tracepoint的format输出信息描述的是entry结构体,tracepoint的前8个字节都是
统一记录struct trace_entry的信息,struct trace_entry占用8个字节,
因此8个字节后的才是每个tracepoint自定义的参数
struct trace_entry {
unsigned short type;
unsigned char flags;
unsigned char preempt_count;
int pid;
}
*/
/*
TRACE_EVENT(kfree_skb,......) 中TP_STRUCT__entry定义的才是kfree_skb的tracepoint能获取的参数。
注意:下面列出的reason字段是较新版本内核才增加的,本文使用的5.4内核的定义以及上面format的输出中都没有该字段。
TP_STRUCT__entry(
__field(void *, skbaddr)
__field(void *, location)
__field(unsigned short, protocol)
__field(enum skb_drop_reason, reason)
),
*/
//ebpf程序内核态要执行的代码:
# cat trace_kfree_skb.bpf.c
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
/*
 * Context structure received by the handler. It mirrors the layout printed by
 * tracing/events/skb/kfree_skb/format: an 8-byte common header followed by
 * the tracepoint's own fields.
 */
struct args_kfree_skb {/* defined after the tracing/events/skb/kfree_skb/format output */
void *regs;/* placeholder covering the 8-byte common header (struct trace_entry, offsets 0-7) */
void *skbaddr;/* format: offset 8, size 8 */
void *location;/* format: offset 16, size 8 */
unsigned short protocol;/* format: offset 24, size 2 */
};
/*
 * Handler for the skb:kfree_skb tracepoint. Prints the skb address and the
 * call-site address taken from the tracepoint record through bpf_printk().
 *
 * args: tracepoint record laid out as struct args_kfree_skb.
 * Returns 1 (see the note on the return statement).
 */
SEC("tracepoint/skb/kfree_skb")
int tp_kfree_skb(struct args_kfree_skb *args)
{
	/* The message must end in "\n" so each event is emitted as its own trace line. */
	bpf_printk("hello test bpf :skbaddr:%llx,location:%llx\n", args->skbaddr, args->location);

	/*
	 * Return non-zero on purpose: with a return value of 0,
	 * `perf trace -e skb:kfree_skb` no longer receives the event. The reason
	 * is explained later, where the kernel function
	 * perf_trace_run_bpf_submit() is discussed.
	 */
	return 1;
}
char LICENSE[] SEC("license") = "GPL";
//编译trace_kfree_skb.bpf.c, /root/ebpf-example/libbpf/src为前面libbpf代码所在路径:
clang -g -O3 -target bpf -D__TARGET_ARCH_x86 -I /root/ebpf-example/libbpf/src -c trace_kfree_skb.bpf.c -o trace_kfree_skb.bpf.o
或者
clang -g -O3 -target bpf -D__TARGET_ARCH_x86_64 -I /root/ebpf-example/libbpf/src -c trace_kfree_skb.bpf.c -o trace_kfree_skb.bpf.o
3. 构建并编译ebpf程序在用户态执行的代码:
/*基于前面ebpf程序编译出来的trace_kfree_skb.bpf.o构建skeleton头文件,ebpf例子程序的
用户态程序会调用头文件里定义的函数来调用libbpf提供的接口将ebpf例子程序的内核部分代码加载
到内核并关联到我们要trace的tracepoint上
生成的 BPF skeleton 有相应的函数来实现每个阶段的触发:
<name>__open() – 创建并打开 BPF 应用程序;
<name>__load() – 实例化、加载和验证 BPF 应用程序部分;
<name>__attach() – 附加所有可自动附加的 BPF 程序(它是可选的,你可以通过直接使用 libbpf API 获得更多控制);
<name>__destroy() – 分离所有BPF 程序并释放所有使用的资源。
*/
bpftool gen skeleton trace_kfree_skb.bpf.o > trace_kfree_skb.skel.h
//ebpf程序用户态要执行的代码:
#include <stdio.h>
#include <unistd.h>
#include <sys/resource.h>
#include <bpf/libbpf.h>
#include "trace_kfree_skb.skel.h"
/*
 * Log callback handed to libbpf_set_print(): writes every libbpf message to
 * stderr, whatever its level, and returns vfprintf()'s result.
 */
static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
{
	int written;

	(void)level; /* no filtering by level */
	written = vfprintf(stderr, format, args);
	return written;
}
/*
 * User-space loader for the example: raises RLIMIT_MEMLOCK, then opens, loads
 * and attaches the trace_kfree_skb BPF skeleton and idles forever.
 *
 * Returns 1 when setrlimit() or any skeleton step fails. On success the
 * function does not return; the process runs until it is killed.
 */
int main(int argc, char **argv)
{
	struct trace_kfree_skb_bpf *skel;
	int err;
	struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};

	/* Lift the locked-memory limit before anything is loaded. */
	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
		perror("setrlimit(RLIMIT_MEMLOCK)");
		return 1;
	}

	libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
	/* Set up libbpf errors and debug info callback */
	libbpf_set_print(libbpf_print_fn);

	/* Open BPF application */
	skel = trace_kfree_skb_bpf__open();
	if (!skel) {
		fprintf(stderr, "Failed to open BPF skeleton\n");
		return 1;
	}

	/* Load & verify BPF programs */
	err = trace_kfree_skb_bpf__load(skel);
	if (err) {
		fprintf(stderr, "Failed to load and verify BPF skeleton\n");
		goto cleanup;
	}

	/* Attach tracepoint handler */
	err = trace_kfree_skb_bpf__attach(skel);
	if (err) {
		fprintf(stderr, "Failed to attach BPF skeleton\n");
		goto cleanup;
	}

	/* The first literal ends in a space so the two halves do not run together. */
	printf("Successfully started! Please run `sudo cat /sys/kernel/debug/tracing/trace_pipe` or run 'bpftool prog tracelog' "
	       "to see output of the BPF programs.\n");

	/* Idle forever; the handler's output is read from trace_pipe. */
	for (;;) {
		sleep(60);
	}

cleanup:
	/*
	 * Only the error paths above jump here (the loop never exits), so the
	 * exit status must report the failure instead of 0.
	 */
	trace_kfree_skb_bpf__destroy(skel);
	return err != 0;
}
/*编译trace_kfree_skb.c, /root/ebpf-example/libbpf/src为前面
libbpf代码以及libbpf的库文件所在路径,使用静态库libbpf.a:*/
clang -g -O2 trace_kfree_skb.c -I /root/ebpf-example/libbpf/src -L /root/ebpf-example/libbpf/src -l:libbpf.a -lelf -lz -o trace_kfree_skb
4. 执行构建的ebpf程序:
代码语言:javascript复制
//执行ebpf程序
./trace_kfree_skb
/*可通过bpftool prog tracelog或者cat /sys/kernel/debug/tracing/trace_pipe查看
bpf_printk输出信息,使用cat /sys/kernel/debug/tracing/trace_pipe查看时需要
确保/sys/kernel/debug/tracing/tracing_on是打开的。*/
# bpftool prog tracelog
sh-2740707 [001] .... 1566460.533201: 0: hello test bpf :skbaddr:ffff88806044f000
sh-2740707 [001] .... 1566460.533208: 0: hello test bpf :skbaddr:ffff88806044f000
......
......
//bpftool查看加载的ebpf程序信息
# bpftool prog show
450: tracepoint name tp_kfree_skb tag 9e46ea8015d2c45f gpl
loaded_at 2023-09-15T21:27:51 0800 uid 0
xlated 200B jited 143B memlock 4096B
btf_id 322
# bpftool prog show id 450
450: tracepoint name tp_kfree_skb tag 9e46ea8015d2c45f gpl
loaded_at 2023-09-15T21:27:51 0800 uid 0
xlated 200B jited 143B memlock 4096B
btf_id 322
# bpftool btf dump id 322
450: tracepoint name tp_kfree_skb tag 9e46ea8015d2c45f gpl
loaded_at 2023-09-15T21:27:51 0800 uid 0
xlated 200B jited 143B memlock 4096B
btf_id 322
[root@VM-2-48-tencentos ebpf-example]# bpftool btf dump id 322
[1] PTR '(anon)' type_id=2
[2] STRUCT 'args_kfree_skb' size=32 vlen=4
'regs' type_id=3 bits_offset=0
'skbaddr' type_id=3 bits_offset=64
'location' type_id=3 bits_offset=128
'protocol' type_id=4 bits_offset=192
[3] PTR '(anon)' type_id=0
[4] INT 'unsigned short' size=2 bits_offset=0 nr_bits=16 encoding=(none)
[5] FUNC_PROTO '(anon)' ret_type_id=6 vlen=1
'args' type_id=1
[6] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED
[7] FUNC 'tp_kfree_skb' type_id=5 linkage=global
[8] INT 'char' size=1 bits_offset=0 nr_bits=8 encoding=SIGNED
[9] ARRAY '(anon)' type_id=8 index_type_id=10 nr_elems=4
[10] INT '__ARRAY_SIZE_TYPE__' size=4 bits_offset=0 nr_bits=32 encoding=(none)
[11] VAR 'LICENSE' type_id=9, linkage=global-alloc
[12] DATASEC 'license' size=4 vlen=1
type_id=11 offset=0 size=4
# bpftool btf dump id 307 format c
#ifndef __VMLINUX_H__
#define __VMLINUX_H__
#ifndef BPF_NO_PRESERVE_ACCESS_INDEX
#pragma clang attribute push (__attribute__((preserve_access_index)), apply_to = record)
#endif
struct args_kfree_skb {
void *regs;
void *skbaddr;
void *location;
unsigned short protocol;
};
#ifndef BPF_NO_PRESERVE_ACCESS_INDEX
#pragma clang attribute pop
#endif
#endif /* __VMLINUX_H__ */
二. ebpf程序被调用执行的流程
从前面的例子在用户态执行的代码trace_kfree_skb.c可以知道使用libbpf来开发ebpf程序时用户态执行的程序主要通过调用前面通过“bpftool gen skeleton trace_kfree_skb.bpf.o > trace_kfree_skb.skel.h”生成的trace_kfree_skb.skel.h提供的函数来实现解析ebpf程序obj文件的elf section、加载object文件到内核以及把程序挂到事件的钩子上。
下面基于前面的例子介绍下这三个步骤主要完成的工作:
1. trace_kfree_skb_bpf__open加载object文件trace_kfree_skb.bpf.o并处理elf section信息:
//trace_kfree_skb.skel.h
/*
 * Allocate the skeleton wrapper, describe the embedded object through
 * trace_kfree_skb_bpf__create_skeleton() and open it with
 * bpf_object__open_skeleton().
 *
 * opts: open options, passed through unchanged to bpf_object__open_skeleton().
 * Returns the new skeleton, or NULL on allocation or open failure; on failure
 * the partially built skeleton is released with trace_kfree_skb_bpf__destroy().
 */
static inline struct trace_kfree_skb_bpf *
trace_kfree_skb_bpf__open_opts(const struct bpf_object_open_opts *opts)
{
	struct trace_kfree_skb_bpf *obj;

	obj = (struct trace_kfree_skb_bpf *)calloc(1, sizeof(*obj));
	if (!obj)
		return NULL;
	if (trace_kfree_skb_bpf__create_skeleton(obj))
		goto err;
	if (bpf_object__open_skeleton(obj->skeleton, opts))
		goto err;
	return obj;
err:
	trace_kfree_skb_bpf__destroy(obj);
	return NULL;
}
//s->name是skeleton(即bpf object)的名字;s->progs[0].name指定的才是当ebpf程序SEC()指定的监听事件被触发时我们的程序要执行的函数名(例子里的tp_kfree_skb)。
//s->data_sz 和s->data分别对应的是ebpf程序的object文件trace_kfree_skb.bpf.o大小和文件数据。
/*
 * Build the bpf_object_skeleton that describes the embedded object: its name,
 * where the resulting bpf_object pointer is stored, the single program entry
 * and the embedded object-file bytes.
 *
 * Returns 0 on success, -1 if an allocation fails (the skeleton is destroyed
 * on the error path after it was allocated).
 */
static inline int
trace_kfree_skb_bpf__create_skeleton(struct trace_kfree_skb_bpf *obj)
{
struct bpf_object_skeleton *s;
s = (struct bpf_object_skeleton *)calloc(1, sizeof(*s));
if (!s)
return -1;
obj->skeleton = s;
s->sz = sizeof(*s);
s->name = "trace_kfree_skb_bpf";
s->obj = &obj->obj;
/* programs */
s->prog_cnt = 1;
s->prog_skel_sz = sizeof(*s->progs);
s->progs = (struct bpf_prog_skeleton *)calloc(s->prog_cnt, s->prog_skel_sz);
if (!s->progs)
goto err;
/* name of the handler function inside the BPF object */
s->progs[0].name = "tp_kfree_skb";
/* NOTE(review): the handler is tp_kfree_skb but the fields below are named trace_kfree_skb; confirm against the generated header */
s->progs[0].prog = &obj->progs.trace_kfree_skb;
s->progs[0].link = &obj->links.trace_kfree_skb;
/* size and bytes of the embedded trace_kfree_skb.bpf.o image; the literal is abridged in this listing and its escape backslashes appear to be missing (x7f x45 x4c x46 = "\x7f" "ELF") */
s->data_sz = 4808;
s->data = (void *)"
x7fx45x4cx46x02x01x01 x01 xf7 x01
x48x0d x40 x40 x16
x01 xb7x02 x6cx78x0a x63x2axf8xff x18x02 x61x74
x69x6f x6ex3ax25x6cx7bx2axf0xff x18x02 x25x6c
x6cx78 x2cx6cx6fx63x7bx2axe8xff x18x02 x73x6b
.......
.......;
return 0;
err:
bpf_object__destroy_skeleton(s);
return -1;
}
//bpf_object__open_skeleton是libbpf提供的库函数:
int bpf_object__open_skeleton(struct bpf_object_skeleton *s,
const struct bpf_object_open_opts *opts)
-->bpf_object__open_mem(const void *obj_buf, size_t obj_buf_sz,
const struct bpf_object_open_opts *opts)//obj_buf和obj_buf_sz传递的参数是object文件数据地址和大小
--> bpf_object_open(NULL, obj_buf, obj_buf_sz, opts)//读取obj文件,解析elf中section信息。
-->obj = bpf_object__new(path, obj_buf, obj_buf_sz, obj_name);//创建并初始化obj结构体用于存放object文件信息
-->err = bpf_object__elf_init(obj);//读取elf文件并检查elf文件是否完整
-->err = err ? : bpf_object__check_endianness(obj);//判断ebpf程序可执行文件大小端
-->err = err ? : bpf_object__elf_collect(obj);//读取elf节信息(license/version/maps/.maps/.BTF/.BTF.ext/.text/.data等)
-->err = err ? : bpf_object__collect_externs(obj);//收集程序中声明的extern符号信息
-->err = err ? : bpf_object_fixup_btf(obj);//读取需要 btf处理的data section
-->err = err ? : bpf_object__init_maps(obj, opts);//读取map信息
-->err = err ? : bpf_object_init_progs(obj, opts);//根据ebpf程序中sec()宏传递的参数匹配对应的SEC_DEF
-->err = err ? : bpf_object__collect_relos(obj);//读取重定位信息
ebpf程序内核态执行的程序编译后会生成一个elf格式的可执行object文件,该文件中包含了编译器指令生成的段,这些段是通过bpf程序调用的SEC()宏生成的,其参数为section的名字,段名定义了 libbpf 程序创建的是什么类型(示例是tracepoint)的 BPF 程序,以及它是附着到内核上哪个地方:
# llvm-objdump --section-headers trace_kfree_skb.bpf.o
trace_kfree_skb.bpf.o: file format elf64-bpf
Sections:
Idx Name Size VMA Type
0 00000000 0000000000000000
1 .strtab 000000f9 0000000000000000
2 .text 00000000 0000000000000000 TEXT
3 tracepoint/skb/kfree_skb 000000c8 0000000000000000 TEXT
4 .rodata.str1.1 0000002c 0000000000000000 DATA
5 license 00000004 0000000000000000 DATA
6 .debug_loc 00000023 0000000000000000 DEBUG
7 .debug_abbrev 000000e8 0000000000000000 DEBUG
8 .debug_info 00000134 0000000000000000 DEBUG
9 .rel.debug_info 000001b0 0000000000000000
10 .debug_ranges 00000030 0000000000000000 DEBUG
11 .debug_str 00000112 0000000000000000 DEBUG
12 .BTF 000002e3 0000000000000000
13 .rel.BTF 00000010 0000000000000000
14 .BTF.ext 00000090 0000000000000000
15 .rel.BTF.ext 00000060 0000000000000000
16 .debug_frame 00000028 0000000000000000 DEBUG
17 .rel.debug_frame 00000020 0000000000000000
18 .debug_line 00000091 0000000000000000 DEBUG
19 .rel.debug_line 00000010 0000000000000000
20 .llvm_addrsig 00000002 0000000000000000
21 .symtab 00000108 00000000000000
通过上面ebpf示例程序编译后生成了一个名为trace_kfree_skb.bpf.o的elf格式的object文件,libbpf库中提供的装载函数会使用这些段的信息。下面是libbpf默认定义的配置,执行libbpf程序时会根据SEC()宏的参数名跟数组section_defs定义的配置名字符串进行比较找到匹配的ebpf类型, libbpf会在执行bpf_object_init_progs函数时根据段的信息决定在后面介绍的attach ebpf程序这一步骤时我们的bpf程序时要执行的动作。
//libbpf.c
/*
 * Table mapping SEC() name prefixes to program type, expected attach type,
 * flags and, where auto-attach is supported, the attach callback.
 *
 * A trailing '+' on a prefix is part of libbpf's matching syntax and must not
 * be dropped: "tracepoint+" is the entry that matches
 * SEC("tracepoint/skb/kfree_skb").
 * The "......" lines stand for entries omitted from this excerpt.
 */
static const struct bpf_sec_def section_defs[] = {
	SEC_DEF("socket", SOCKET_FILTER, 0, SEC_NONE),
	SEC_DEF("sk_reuseport/migrate", SK_REUSEPORT, BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, SEC_ATTACHABLE),
	SEC_DEF("sk_reuseport", SK_REUSEPORT, BPF_SK_REUSEPORT_SELECT, SEC_ATTACHABLE),
	SEC_DEF("kprobe+", KPROBE, 0, SEC_NONE, attach_kprobe),
	SEC_DEF("uprobe+", KPROBE, 0, SEC_NONE, attach_uprobe),
	SEC_DEF("uprobe.s+", KPROBE, 0, SEC_SLEEPABLE, attach_uprobe),
	SEC_DEF("kretprobe+", KPROBE, 0, SEC_NONE, attach_kprobe),
	SEC_DEF("uretprobe+", KPROBE, 0, SEC_NONE, attach_uprobe),
	SEC_DEF("uretprobe.s+", KPROBE, 0, SEC_SLEEPABLE, attach_uprobe),
	SEC_DEF("kprobe.multi+", KPROBE, BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi),
	SEC_DEF("kretprobe.multi+", KPROBE, BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi),
	SEC_DEF("uprobe.multi+", KPROBE, BPF_TRACE_UPROBE_MULTI, SEC_NONE, attach_uprobe_multi),
	SEC_DEF("uretprobe.multi+", KPROBE, BPF_TRACE_UPROBE_MULTI, SEC_NONE, attach_uprobe_multi),
	SEC_DEF("uprobe.multi.s+", KPROBE, BPF_TRACE_UPROBE_MULTI, SEC_SLEEPABLE, attach_uprobe_multi),
	SEC_DEF("uretprobe.multi.s+", KPROBE, BPF_TRACE_UPROBE_MULTI, SEC_SLEEPABLE, attach_uprobe_multi),
	SEC_DEF("ksyscall+", KPROBE, 0, SEC_NONE, attach_ksyscall),
	SEC_DEF("kretsyscall+", KPROBE, 0, SEC_NONE, attach_ksyscall),
	SEC_DEF("usdt+", KPROBE, 0, SEC_USDT, attach_usdt),
	SEC_DEF("usdt.s+", KPROBE, 0, SEC_USDT | SEC_SLEEPABLE, attach_usdt),
	SEC_DEF("tc/ingress", SCHED_CLS, BPF_TCX_INGRESS, SEC_NONE), /* alias for tcx */
	SEC_DEF("tc/egress", SCHED_CLS, BPF_TCX_EGRESS, SEC_NONE), /* alias for tcx */
	SEC_DEF("tcx/ingress", SCHED_CLS, BPF_TCX_INGRESS, SEC_NONE),
	SEC_DEF("tcx/egress", SCHED_CLS, BPF_TCX_EGRESS, SEC_NONE),
	SEC_DEF("tc", SCHED_CLS, 0, SEC_NONE), /* deprecated / legacy, use tcx */
	SEC_DEF("classifier", SCHED_CLS, 0, SEC_NONE), /* deprecated / legacy, use tcx */
	SEC_DEF("action", SCHED_ACT, 0, SEC_NONE), /* deprecated / legacy, use tcx */
	SEC_DEF("tracepoint+", TRACEPOINT, 0, SEC_NONE, attach_tp),
	SEC_DEF("tp+", TRACEPOINT, 0, SEC_NONE, attach_tp),
	SEC_DEF("raw_tracepoint+", RAW_TRACEPOINT, 0, SEC_NONE, attach_raw_tp),
......
......
};
/*我们的示例程序中SEC()宏用的是SEC("tracepoint/skb/kfree_skb"),所以对应的是section_defs数组里的
SEC_DEF("tracepoint+",TRACEPOINT,0,SEC_NONE, attach_tp),
因此bpf_sec_def成员prog_attach_fn函数指针赋值为TRACEPOINT类型定义的attach_tp函数*/
/*
 * Build one bpf_sec_def initializer.
 *
 * sec_pfx: section-name prefix; ptype: suffix of a BPF_PROG_TYPE_* constant;
 * atype: expected attach type; flags: stored in .cookie.
 * Any extra arguments are appended after .prog_prepare_load_fn, so a single
 * positional argument initializes the member that follows it in
 * struct bpf_sec_def (prog_attach_fn).
 *
 * Each line of a multi-line macro must end in a backslash; without the
 * continuations the body is not part of the macro.
 */
#define SEC_DEF(sec_pfx, ptype, atype, flags, ...) { \
	.sec = (char *)sec_pfx, \
	.prog_type = BPF_PROG_TYPE_##ptype, \
	.expected_attach_type = atype, \
	.cookie = (long)(flags), \
	.prog_prepare_load_fn = libbpf_prepare_prog_load, \
	__VA_ARGS__ \
}
/* One entry of section_defs: how programs in a matching ELF section are typed, loaded and attached. */
struct bpf_sec_def {
char *sec; /* section-name prefix (first SEC_DEF argument) */
enum bpf_prog_type prog_type; /* BPF_PROG_TYPE_* selected by SEC_DEF's ptype */
enum bpf_attach_type expected_attach_type; /* SEC_DEF's atype */
long cookie; /* SEC_DEF stores its flags here; passed to prog_attach_fn by bpf_object__attach_skeleton() */
int handler_id; /* not set by SEC_DEF */
libbpf_prog_setup_fn_t prog_setup_fn; /* not set by SEC_DEF */
libbpf_prog_prepare_load_fn_t prog_prepare_load_fn; /* SEC_DEF always sets libbpf_prepare_prog_load */
libbpf_prog_attach_fn_t prog_attach_fn; /* auto-attach callback, e.g. attach_tp; entries without one are skipped on auto-attach */
};
2. 加载bpf程序编译成的object文件到内核
//trace_kfree_skb_bpf__load为bpftool gen skeleton trace_kfree_skb.bpf.o自动生成的函数
/*
 * Load step of the generated skeleton: hands the bpf_object_skeleton stored
 * in obj to bpf_object__load_skeleton() and returns its result.
 */
static inline int
trace_kfree_skb_bpf__load(struct trace_kfree_skb_bpf *obj)
{
	int rc;

	rc = bpf_object__load_skeleton(obj->skeleton);
	return rc;
}
//bpf_object__load_skeleton为libbpf提供的库函数
int bpf_object__load_skeleton(struct bpf_object_skeleton *s)
-->bpf_object__load(*s->obj);//bpf_object__load(struct bpf_object *obj)加载第一步生成的obj结构体
-->bpf_object_load(obj, 0, NULL);
err = bpf_object__probe_loading(obj);//通过bpf系统调用syscall(__NR_bpf,BPF_PROG_LOAD, attr, size)试加载一个最小的bpf程序,探测当前环境是否能够加载bpf程序
err = err ? : bpf_object__load_vmlinux_btf(obj, false);//读取内核vmlinux信息
err = err ? : bpf_object__resolve_externs(obj, obj->kconfig);//读取内核kconfig/kallsysm/vmlinux btf信息
err = err ? : bpf_object__sanitize_and_load_btf(obj);//syscall(__NR_bpf,BPF_BTF_LOAD, attr, size)加载btf信息到内核
err = err ? : bpf_object__sanitize_maps(obj);//判断内核支持的map种类
err = err ? : bpf_object__init_kern_struct_ops_maps(obj);//初始化bpf_map结构的相关字段
err = err ? : bpf_object__create_maps(obj);//syscall(__NR_bpf,BPF_MAP_CREATE, attr, size)创建map
err = err ? : bpf_object__relocate(obj, obj->btf_custom_path ? : target_btf_path);//处理bpf代码重定位信息
err = err ? : bpf_object__load_progs(obj, extra_log_level);//syscall(__NR_bpf,BPF_PROG_LOAD, attr, size)加载经过重定位、btf修改的bpf代码
err = err ? : bpf_object_init_prog_arrays(obj);
err = err ? : bpf_object_prepare_struct_ops(obj);
3. 挂载ebpf程序的handler函数(例子里的tp_kfree_skb)指定的event tracepoint
//trace_kfree_skb_bpf__attach为bpftool gen skeleton trace_kfree_skb.bpf.o自动生成的函数
/*
 * Attach step of the generated skeleton: hands the bpf_object_skeleton stored
 * in obj to bpf_object__attach_skeleton() and returns its result.
 */
static inline int
trace_kfree_skb_bpf__attach(struct trace_kfree_skb_bpf *obj)
{
	int rc;

	rc = bpf_object__attach_skeleton(obj->skeleton);
	return rc;
}
//bpf_object__attach_skeleton为libbpf的库函数
/*
 * Auto-attach every program described by the skeleton.
 *
 * For each program entry the section definition's prog_attach_fn is called,
 * unless autoload/autoattach is off, the section type has no attach callback,
 * or the caller already stored a link manually.
 *
 * Returns 0 on success, or libbpf_err(err) for the first attach failure.
 */
int bpf_object__attach_skeleton(struct bpf_object_skeleton *s)
{
	int i, err;

	for (i = 0; i < s->prog_cnt; i++) {
		/*
		 * trace_kfree_skb_bpf__create_skeleton() stored
		 * &obj->progs.trace_kfree_skb in s->progs[0].prog and
		 * &obj->links.trace_kfree_skb in s->progs[0].link.
		 * *prog is dereferenced right below, so it must already point
		 * at the opened program (presumably filled in by
		 * bpf_object__open_skeleton()). Only *link is still NULL here,
		 * because the example never attaches the program manually.
		 */
		struct bpf_program *prog = *s->progs[i].prog;
		struct bpf_link **link = s->progs[i].link;

		if (!prog->autoload || !prog->autoattach)
			continue;

		/* auto-attaching not supported for this program */
		if (!prog->sec_def || !prog->sec_def->prog_attach_fn)
			continue;

		/* if user already set the link manually, don't attempt auto-attach */
		if (*link)
			continue;

		/*
		 * prog_attach_fn comes from the matching section_defs entry;
		 * for the example that is
		 * SEC_DEF("tracepoint+", TRACEPOINT, 0, SEC_NONE, attach_tp),
		 * so attach_tp is called here.
		 */
		err = prog->sec_def->prog_attach_fn(prog, prog->sec_def->cookie, link);
		if (err) {
			pr_warn("prog '%s': failed to auto-attach: %d\n",
				bpf_program__name(prog), err);
			return libbpf_err(err);
		}

		/* It's possible that for some SEC() definitions auto-attach
		 * is supported in some cases (e.g., if definition completely
		 * specifies target information), but is not in other cases.
		 * SEC("uprobe") is one such case. If user specified target
		 * binary and function name, such BPF program can be
		 * auto-attached. But if not, it shouldn't trigger skeleton's
		 * attach to fail. It should just be skipped.
		 * attach_fn signals such case with returning 0 (no error) and
		 * setting link to NULL.
		 */
	}

	return 0;
}
/*attach_tp主要完成tracepoint注册以及使能并将ebpf程序handler函数挂载到指定的event tracepoint上
perf_event_open_tracepoint通过/sys/kernel/debug/tracing/events/skb/kfree_skb/id获取event id,
再通过系统调用__NR_perf_event_open注册和使能对应的tracepoint __tracepoint_##name也就是我们
示例用到的__tracepoint_kfree_skb,并返回对应的perf event 的文件描述符pfd.
ioctl(pfd, PERF_EVENT_IOC_SET_BPF, prog_fd)将ebpf程序与对应的tracepoint关联上,当tracepoint
被调用时ebpf程序handler函数也会被执行*/
static int attach_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link)
->bpf_program__attach_tracepoint(prog, tp_cat, tp_name);
->bpf_program__attach_tracepoint_opts(prog, tp_category, tp_name, NULL);
->pfd = perf_event_open_tracepoint(tp_category, tp_name);
-->pfd = syscall(__NR_perf_event_open,...)//注册tracepoint
->link = bpf_program__attach_perf_event_opts(prog, pfd, &pe_opts);
/*kernel_supports(prog->obj, FEAT_PERF_LINK)检查到内核不支持BPF_LINK_CREATE
因此libbpf执行ioctl(pfd, PERF_EVENT_IOC_SET_BPF, prog_fd),这里实现将prog_fd对应
的ebpf程序跟pfd对应的perf event关联上,从而实现挂载ebpf程序的handler函数tp_kfree_skb
到指定要跟踪的tracepoint*/
-->ioctl(pfd, PERF_EVENT_IOC_SET_BPF, prog_fd)
-->ioctl(pfd, PERF_EVENT_IOC_ENABLE, 0)//启用文件描述符参数对应的perf event 计数
//strace跟踪trace_kfree_skb系统调用
#strace ./trace_kfree_skb
.....
faccessat(AT_FDCWD, "/sys/kernel/debug/tracing", F_OK) = 0
openat(AT_FDCWD, "/sys/kernel/debug/tracing/events/skb/kfree_skb/id", O_RDONLY|O_CLOEXEC) = 6
fstat(6, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
read(6, "1409\n", 4096) = 5
read(6, "", 4096) = 0
close(6)
perf_event_open({type=PERF_TYPE_TRACEPOINT, size=0x88 /* PERF_ATTR_SIZE_??? */, config=1409, sample_period=0, sample_type=0, read_format=0, precise_ip=0 /* arbitrary skid */, ...}, -1, 0, -1, PERF_FLAG_FD_CLOEXEC) = 6
bpf(BPF_PROG_LOAD, {prog_type=BPF_PROG_TYPE_TRACEPOINT, insn_cnt=2, insns=0x7fff34970d80, license="GPL", log_level=0, log_size=0, log_buf=NULL, kern_version=KERNEL_VERSION(0, 0, 0), prog_flags=0, prog_name="", prog_ifindex=0, expected_attach_type=BPF_CGROUP_INET_INGRESS, prog_btf_fd=0, func_info_rec_size=0, func_info=NULL, func_info_cnt=0, line_info_rec_size=0, line_info=NULL, line_info_cnt=0, attach_btf_id=0, attach_prog_fd=0, fd_array=NULL}, 144) = 7
bpf(BPF_LINK_CREATE, {link_create={prog_fd=7, target_fd=-1, attach_type=BPF_PERF_EVENT, flags=0, perf_event={bpf_cookie=0}}}, 64) = -1 EINVAL (Invalid argument)
close(7) = 0
ioctl(6, PERF_EVENT_IOC_SET_BPF, 5) = 0
ioctl(6, PERF_EVENT_IOC_ENABLE, 0) = 0
/*tracepoint类型的eBPF程序与kprobe类似,都是基于perf来实现的,
libbpf通过perf_event_open在实现给perf注册tracepoint的时候的调用链为:*/
perf_event_open
->perf_event_alloc
->perf_init_event
->perf_try_init_event
->(pmu->event_init(event))//pmu->event_init对应perf_tp_event_init
->perf_trace_init
->perf_trace_event_init
->perf_trace_event_reg
->tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL);//trace_event_reg
->tracepoint_probe_register
/*系统启动初始化时通过调用链perf_event_init->perf_tp_register->perf_pmu_register将
pmu->event_init初始化为perf_tp_event_init*/
/*
 * PMU description registered for PERF_TYPE_TRACEPOINT by perf_tp_register().
 * Its event_init callback, perf_tp_event_init, is what perf_try_init_event()
 * reaches through pmu->event_init in the call chain above.
 */
static struct pmu perf_tracepoint = {
.task_ctx_nr = perf_sw_context,
.event_init = perf_tp_event_init,
.add = perf_trace_add,
.del = perf_trace_del,
.start = perf_swevent_start,
.stop = perf_swevent_stop,
.read = perf_swevent_read,
};
/* Register perf_tracepoint under the name "tracepoint" with type PERF_TYPE_TRACEPOINT. */
static inline void perf_tp_register(void)
{
perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
}
//kernel/events/core.c
void __init perf_event_init(void)
{
......
perf_tp_register();
......
}
/*p_event->tp_event = tp_event; 实现将一个tracepoint所对应的trace_event_call与perf event相关联,
这样完成perf_event 与trace_event_call的关联,ioctl(pfd, PERF_EVENT_IOC_SET_BPF, prog_fd)将
ebpf程序与指定的tracepoint绑定时会通过perf_event找到对应的trace_event_call*/
/*
 * Excerpt: bind a perf event to its trace_event_call and register the perf
 * probe through the event class' reg callback. The "......" lines stand for
 * code omitted from this listing (including the `fail` label).
 */
static int perf_trace_event_reg(struct trace_event_call *tp_event,
				struct perf_event *p_event)
{
	struct hlist_head __percpu *list;
	int ret = -ENOMEM;
	int cpu;

	/* Link the perf event to the tracepoint's trace_event_call. */
	p_event->tp_event = tp_event;
......
	ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL);
	if (ret)
		goto fail;

	/* Count the successful registration. */
	total_ref_count++;
.......
}
/*讲到内核部分绕不开TRACE_EVENT的实现机制,内核通过TRACE_EVENT来定义一个tracepoint,然后
在内核需要调用的地方使用函数trace_xxx来打印输出相关信息,比如kfree_skb调用trace_kfree_skb函数
trace_kfree_skb实现通过TRACE_EVENT(kfree_skb,...)在编译阶段完成定义.
编译后trace_##name会被替换为trace_kfree_skb*/
//include/trace/events/skb.h
/*
 * Definition of the kfree_skb tracepoint: the probe takes the skb and the
 * call-site address, and records the skb address, the call-site address and
 * the protocol (converted with ntohs()) in the entry.
 */
TRACE_EVENT(kfree_skb,
TP_PROTO(struct sk_buff *skb, void *location),
TP_ARGS(skb, location),
TP_STRUCT__entry(
__field( void *, skbaddr )
__field( void *, location )
__field( unsigned short, protocol )
),
TP_fast_assign(
__entry->skbaddr = skb;
__entry->location = location;
__entry->protocol = ntohs(skb->protocol);
),
TP_printk("skbaddr=%p protocol=%u location=%p",
__entry->skbaddr, __entry->protocol, __entry->location)
);
//include/linux/tracepoint.h
/*
 * Excerpt of __DECLARE_TRACE: declares __tracepoint_<name> and defines the
 * inline trace_<name>() stub. The stub calls __DO_TRACE only when the
 * tracepoint's static key is enabled; the CONFIG_LOCKDEP branch dereferences
 * funcs under the sched-RCU read lock.
 *
 * Every line of the macro body needs a trailing backslash.
 */
#define __DECLARE_TRACE(name, proto, args, cond, data_proto, data_args) \
	extern struct tracepoint __tracepoint_##name; \
	static inline void trace_##name(proto) \
	{ \
		if (static_key_false(&__tracepoint_##name.key)) \
			__DO_TRACE(&__tracepoint_##name, \
				TP_PROTO(data_proto), \
				TP_ARGS(data_args), \
				TP_CONDITION(cond), 0); \
		if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \
			rcu_read_lock_sched_notrace(); \
			rcu_dereference_sched(__tracepoint_##name.funcs); \
			rcu_read_unlock_sched_notrace(); \
		} \
	}
/*include/trace/events/skb.h 中一共包含了两个头文件:include/linux/tracepoint.h和
include/trace/define_trace.h,在include/trace/define_trace.h中,
通过#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)语句再次包含include/trace/events/skb.h,
另外通过#include <trace/trace_events.h>和#include <trace/perf.h>包含include/trace/trace_events.h
和include/trace/perf.h,再结合#undef TRACE_EVENT和#define TRACE_EVENT的方式实现了一个
宏定义多次展开的效果*/
/*通过include/linux/tracepoint.h、include/trace/define_trace.h、include/trace/trace_events.h
和include/trace/perf.h头文件对TRACE_EVENT宏的多次展开后,
tp_event->class->reg对应trace_event_reg函数*/
//include/trace/trace_events.h
static struct trace_event_class __used __refdata event_class_##call = {
.system = TRACE_SYSTEM_STRING,
.define_fields = trace_event_define_fields_##call,
.fields = LIST_HEAD_INIT(event_class_##call.fields),
.raw_init = trace_event_raw_init,
.probe = trace_event_raw_event_##call,
.reg = trace_event_reg,
_TRACE_PERF_INIT(call)
};
//kernel/trace/trace_events.c
int trace_event_reg(struct trace_event_call *call,
enum trace_reg type, void *data)
{
struct trace_event_file *file = data;
WARN_ON(!(call->flags & TRACE_EVENT_FL_TRACEPOINT));
switch (type) {
.......
#ifdef CONFIG_PERF_EVENTS
case TRACE_REG_PERF_REGISTER:
return tracepoint_probe_register(call->tp,
call->class->perf_probe,
call);
case TRACE_REG_PERF_UNREGISTER:
tracepoint_probe_unregister(call->tp,
call->class->perf_probe,
call);
return 0;
.......
return 0;
#endif
}
return 0;
}
/*tracepoint_probe_register(call->tp,call->class->perf_probe,call);参数
call->tp是在perf_trace_init函数通过遍历ftrace_events获取,而从DEFINE_EVENT的定义
可以知道trace_event_call是被放在名为_ftrace_events的section中*/
int perf_trace_init(struct perf_event *p_event)
{
struct trace_event_call *tp_event;
u64 event_id = p_event->attr.config;
int ret = -EINVAL;
mutex_lock(&event_mutex);
list_for_each_entry(tp_event, &ftrace_events, list) {
if (tp_event->event.type == event_id &&
tp_event->class && tp_event->class->reg &&
try_module_get(tp_event->mod)) {
ret = perf_trace_event_init(tp_event, p_event);
if (ret)
module_put(tp_event->mod);
break;
}
}
mutex_unlock(&event_mutex);
return ret;
}
//include/trace/trace_events.h
/*
 * Initializer fragment used inside event_class_<call>: sets .perf_probe to
 * perf_trace_<call>. The continuation backslash keeps the initializer part of
 * the macro.
 */
#define _TRACE_PERF_INIT(call) \
	.perf_probe = perf_trace_##call,
//include/trace/trace_events.h
/*
 * trace_events.h expansion stage of DECLARE_EVENT_CLASS: emits the print
 * format string and the trace_event_class object event_class_<call>, whose
 * .reg is trace_event_reg and whose .perf_probe comes from _TRACE_PERF_INIT.
 *
 * Every line of the macro body needs a trailing backslash.
 */
#undef DECLARE_EVENT_CLASS
#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
_TRACE_PERF_PROTO(call, PARAMS(proto)); \
static char print_fmt_##call[] = print; \
static struct trace_event_class __used __refdata event_class_##call = { \
	.system = TRACE_SYSTEM_STRING, \
	.define_fields = trace_event_define_fields_##call, \
	.fields = LIST_HEAD_INIT(event_class_##call.fields), \
	.raw_init = trace_event_raw_init, \
	.probe = trace_event_raw_event_##call, \
	.reg = trace_event_reg, \
	_TRACE_PERF_INIT(call) \
};
/*
 * trace_events.h expansion stage of DEFINE_EVENT: emits the trace_event_call
 * object event_<call> (class = event_class_<template>, tp =
 * &__tracepoint_<call>) and places a pointer to it in the "_ftrace_events"
 * section.
 *
 * Every line of the macro body except the last needs a trailing backslash.
 */
#undef DEFINE_EVENT
#define DEFINE_EVENT(template, call, proto, args) \
static struct trace_event_call __used event_##call = { \
	.class = &event_class_##template, \
	{ \
		.tp = &__tracepoint_##call, \
	}, \
	.event.funcs = &trace_event_type_funcs_##template, \
	.print_fmt = print_fmt_##template, \
	.flags = TRACE_EVENT_FL_TRACEPOINT, \
}; \
static struct trace_event_call __used \
__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call
//_ftrace_events这个section被包含在__start_ftrace_events和__stop_ftrace_events之间
//include/asm-generic/vmlinux.lds.h
/*
 * Linker-script fragment: keeps the _ftrace_events input section between the
 * __start_ftrace_events and __stop_ftrace_events symbols, followed by the
 * _ftrace_eval_map section with its own start/stop symbols.
 *
 * Every line of the macro body except the last needs a trailing backslash.
 */
#define FTRACE_EVENTS() . = ALIGN(8); \
	VMLINUX_SYMBOL(__start_ftrace_events) = .; \
	KEEP(*(_ftrace_events)) \
	VMLINUX_SYMBOL(__stop_ftrace_events) = .; \
	VMLINUX_SYMBOL(__start_ftrace_eval_maps) = .; \
	KEEP(*(_ftrace_eval_map)) \
	VMLINUX_SYMBOL(__stop_ftrace_eval_maps) = .;
/*系统启动初始化时会将调用event_trace_enable将__start_ftrace_events和__stop_ftrace_events之间
的_ftrace_events段加到LIST_HEAD(ftrace_events) 上,从而实现perf_trace_init能够通过
遍历&ftrace_events来获取所有的trace_event_call,并通过判断跟libbpf传入的event_id是否匹配来
找到我们ebpf程序要跟踪的perf event*/
static __init int event_trace_enable(void)
{
struct trace_array *tr = top_trace_array();
struct trace_event_call **iter, *call;
int ret;
if (!tr)
return -ENODEV;
for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) {
call = *iter;
ret = event_init(call);
if (!ret)
list_add(&call->list, &ftrace_events);
}
......
......
}
/*在我们的示例中perf_trace_##call宏展开后为perf_trace_kfree_skb*/
//include/trace/perf.h
/*
 * perf.h expansion stage of DECLARE_EVENT_CLASS: generates perf_trace_<call>(),
 * the probe registered for perf/BPF consumers. __data is the
 * trace_event_call. The function returns early when no BPF program is
 * attached and the per-CPU perf_events list is empty; otherwise it allocates
 * a perf trace buffer, fills the entry through `assign` and passes it to
 * perf_trace_run_bpf_submit().
 *
 * The entry size is __data_size + sizeof(*entry) + sizeof(u32), rounded up to
 * a multiple of sizeof(u64), with the u32 subtracted again afterwards.
 *
 * Every line of the macro body needs a trailing backslash.
 */
#undef DECLARE_EVENT_CLASS
#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
static notrace void \
perf_trace_##call(void *__data, proto) \
{ \
	struct trace_event_call *event_call = __data; \
	struct trace_event_data_offsets_##call __maybe_unused __data_offsets; \
	struct trace_event_raw_##call *entry; \
	struct bpf_prog *prog = event_call->prog; \
	struct pt_regs *__regs; \
	u64 __count = 1; \
	struct task_struct *__task = NULL; \
	struct hlist_head *head; \
	int __entry_size; \
	int __data_size; \
	int rctx; \
 \
	__data_size = trace_event_get_offsets_##call(&__data_offsets, args); \
 \
	head = this_cpu_ptr(event_call->perf_events); \
	if (!prog && __builtin_constant_p(!__task) && !__task && \
	    hlist_empty(head)) \
		return; \
 \
	__entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32), \
			     sizeof(u64)); \
	__entry_size -= sizeof(u32); \
 \
	entry = perf_trace_buf_alloc(__entry_size, &__regs, &rctx); \
	if (!entry) \
		return; \
 \
	perf_fetch_caller_regs(__regs); \
 \
	tstruct \
 \
	{ assign; } \
 \
	perf_trace_run_bpf_submit(entry, __entry_size, rctx, \
				  event_call, __count, __regs, \
				  head, __task); \
}
/*从上面的分析可以知道trace_event_reg调用执行
tracepoint_probe_register(call->tp,call->class->perf_probe,call)时第一个
参数struct tracepoint *tp对应的是call->tp,其对应的
是struct trace_event_call __used event_kfree_skb.tp也就是&__tracepoint_kfree_skb。
第二个参数call->class->perf_probe,call->class对应的是
struct trace_event_call __used event_kfree_skb.class也就是&event_class_##template,
&event_class_##template实现在static struct trace_event_class __used __refdata event_class_##call
处定义,event_class_##call成员对应的宏_TRACE_PERF_INIT为.perf_probe = perf_trace_##call,
因此call->class->perf_probe对应的是perf_trace_##call展开后的函数perf_trace_kfree_skb*/
/*
 * Register probe (with its data argument) on tracepoint tp at
 * TRACEPOINT_DEFAULT_PRIO; returns the result of
 * tracepoint_probe_register_prio().
 */
int tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data)
{
return tracepoint_probe_register_prio(tp, probe, data, TRACEPOINT_DEFAULT_PRIO);
}
/*tp_func初始化后通过tracepoint_add_func加入到tracepoint的funcs的尾部。
__DO_TRACE会遍历tracepoint指针成员funcs指向的所有tracepoint_func,从而获得要调用执行的函数,
在我们的例子中对应perf_trace_kfree_skb以及perf_trace_kfree_skb的第一个参数event_kfree_skb.*/
int tracepoint_probe_register_prio(struct tracepoint *tp, void *probe,
void *data, int prio)
{
struct tracepoint_func tp_func;
int ret;
mutex_lock(&tracepoints_mutex);
tp_func.func = probe;//perf_trace_kfree_skb
tp_func.data = data;//struct trace_event_call event_kfree_skb
tp_func.prio = prio;
ret = tracepoint_add_func(tp, &tp_func, prio);
mutex_unlock(&tracepoints_mutex);
return ret;
}
/*tracepoint_add_func 这里会将perf_trace_##call展开的perf_trace_kfree_skb加入到
struct tracepoint __tracepoint_##name的成员funcs也就
是struct tracepoint __tracepoint_kfree_skb.funcs尾巴上,另外tracepoint_add_func
调用static_key_slow_inc将__tracepoint_kfree_skb.key加1使能该event tracepoint*/
/* Per-tracepoint state: name, enable key, optional register hooks and the probe array. */
struct tracepoint {
const char *name; /* Tracepoint name */
struct static_key key; /* tested by trace_<name>() with static_key_false(); incremented by tracepoint_add_func() when not yet enabled */
int (*regfunc)(void); /* optional hook; tracepoint_add_func() calls it while the key is still disabled */
void (*unregfunc)(void);
struct tracepoint_func __rcu *funcs; /* probe array, published with rcu_assign_pointer() */
};
/*
 * Add the probe function to a tracepoint.
 *
 * Calls tp->regfunc (when set and the key is still disabled), builds a new
 * probe array with func_add(), publishes it in tp->funcs, enables the static
 * key if needed and releases the old array.
 *
 * Returns 0 on success, a negative value from tp->regfunc(), or
 * PTR_ERR() of func_add()'s result.
 */
static int tracepoint_add_func(struct tracepoint *tp,
struct tracepoint_func *func, int prio)
{
struct tracepoint_func *old, *tp_funcs;
int ret;
/* Run the optional register hook before the tracepoint is first enabled. */
if (tp->regfunc && !static_key_enabled(&tp->key)) {
ret = tp->regfunc();
if (ret < 0)
return ret;
}
/* The lockdep expression documents that tracepoints_mutex is expected to be held. */
tp_funcs = rcu_dereference_protected(tp->funcs,
lockdep_is_held(&tracepoints_mutex));
old = func_add(&tp_funcs, func, prio);
if (IS_ERR(old)) {
WARN_ON_ONCE(PTR_ERR(old) != -ENOMEM);
return PTR_ERR(old);
}
/*
* rcu_assign_pointer has a smp_wmb() which makes sure that the new
* probe callbacks array is consistent before setting a pointer to it.
* This array is referenced by __DO_TRACE from
* include/linux/tracepoints.h. A matching smp_read_barrier_depends()
* is used.
*/
rcu_assign_pointer(tp->funcs, tp_funcs);
/* Enable the tracepoint's static key if it is not enabled yet. */
if (!static_key_enabled(&tp->key))
static_key_slow_inc(&tp->key);
release_probes(old);
return 0;
}
前面介绍libbpf是通过调用ioctl(pfd, PERF_EVENT_IOC_SET_BPF, prog_fd)来将prog_fd对应
的ebpf程序跟pfd对应的perf event关联上,进而关联到指定要跟踪的tracepoint上,这里的实现逻辑如下:
/*p_event->tp_event = tp_event 实现将一个tracepoint所对应的trace_event_call与perf event相关联,
这样完成perf_event 与trace_event_call的关联*/
/*
 * Abridged excerpt of perf_trace_event_reg(): the "......" lines stand for
 * code left out of this walkthrough. The part shown stores the tracepoint's
 * trace_event_call in the perf event.
 */
static int perf_trace_event_reg(struct trace_event_call *tp_event,
struct perf_event *p_event)
{
......
/* Link the perf event to the tracepoint's trace_event_call. */
p_event->tp_event = tp_event;
......
}
ioctl(pfd, PERF_EVENT_IOC_SET_BPF, prog_fd)调用链路:
ioctl(pfd, PERF_EVENT_IOC_SET_BPF, prog_fd)
-->perf_event_set_bpf_prog(event, arg);
-->perf_event_attach_bpf_prog(event, prog);
/*perf_event_attach_bpf_prog下面部分代码实现新添加的ebpf程序的函数加入到
event->tp_event->prog_array尾部上*/
/*
 * Abridged excerpt of perf_event_attach_bpf_prog(): the dotted lines stand
 * for code left out of this walkthrough, including the "unlock" label that
 * the error paths jump to.
 *
 * @event: perf event whose tp_event receives the program
 * @prog:  BPF program to attach
 *
 * The part shown replaces event->tp_event->prog_array with a new array
 * built from the old one and prog, and records prog in event->prog.
 */
int perf_event_attach_bpf_prog(struct perf_event *event,
struct bpf_prog *prog)
{
.......
old_array = bpf_event_rcu_dereference(event->tp_event->prog_array);
/* Refuse to attach once the array already holds BPF_TRACE_MAX_PROGS programs. */
if (old_array &&
bpf_prog_array_length(old_array) >= BPF_TRACE_MAX_PROGS) {
ret = -E2BIG;
goto unlock;
}
/* Presumably copies old_array and appends prog into new_array; confirm against bpf_prog_array_copy(). */
ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array);
if (ret < 0)
goto unlock;
/* set the new array to event->tp_event and set event->prog */
event->prog = prog;
rcu_assign_pointer(event->tp_event->prog_array, new_array);
......
}
4. 内核执行添加的ebpf程序调用链:
基于前面的分析可以知道,最终当内核调用trace_kfree_skb时,trace_kfree_skb会调用__DO_TRACE
遍历并执行__tracepoint_kfree_skb.funcs中注册的回调,也就是perf_trace_kfree_skb会被调用;
perf_trace_kfree_skb最终通过调用perf_trace_run_bpf_submit,进而调用trace_call_bpf
遍历执行关联到对应tracepoint上的所有ebpf程序,这里通过遍历trace_event_call.prog_array
来实现,从而示例中的tp_kfree_skb也就被调用,相关调用链如下:
trace_kfree_skb //trace_##name展开而来
-->__DO_TRACE
-->perf_trace_kfree_skb//perf_trace_##call展开而来
-->perf_trace_run_bpf_submit
-->trace_call_bpf
//BPF_PROG_RUN_ARRAY_CHECK遍历trace_event_call.prog_array所有挂载到该tracepoint的bpf程序
-->BPF_PROG_RUN_ARRAY_CHECK
-->tp_kfree_skb//执行前面例子ebpf程序handler函数
//include/linux/tracepoint.h
/*
 * __DECLARE_TRACE - declare a tracepoint and emit its trace_<name>() hook
 * (abridged excerpt: only the trace_<name>() part is shown).
 * @name:       tracepoint name; yields __tracepoint_<name> and trace_<name>()
 * @proto:      parameter list of trace_<name>()
 * @args:       argument names matching @proto (unused in this excerpt)
 * @cond:       extra condition forwarded to __DO_TRACE via TP_CONDITION()
 * @data_proto: prototype forwarded to __DO_TRACE via TP_PROTO()
 * @data_args:  arguments forwarded to __DO_TRACE via TP_ARGS()
 *
 * Every line but the last must end in a backslash; without the
 * continuations the #define has an empty body and the rest becomes stray
 * file-scope code.
 */
#define __DECLARE_TRACE(name, proto, args, cond, data_proto, data_args)	\
	extern struct tracepoint __tracepoint_##name;			\
	static inline void trace_##name(proto)				\
	{								\
		/* Run the probes only when the static key is enabled. */ \
		if (static_key_false(&__tracepoint_##name.key))		\
			__DO_TRACE(&__tracepoint_##name,		\
				TP_PROTO(data_proto),			\
				TP_ARGS(data_args),			\
				TP_CONDITION(cond), 0);			\
		/* With CONFIG_LOCKDEP, touch funcs under sched-RCU even */ \
		/* when the key is off (presumably for lockdep checks). */ \
		if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) {		\
			rcu_read_lock_sched_notrace();			\
			rcu_dereference_sched(__tracepoint_##name.funcs);\
			rcu_read_unlock_sched_notrace();		\
		}							\
	}
//kernel/events/core.c
/*
 * perf_trace_run_bpf_submit - run attached BPF programs, then submit the
 * tracepoint sample to perf unless the programs filtered it out.
 * @raw_data: record buffer; its first pointer-sized slot is overwritten
 *            with @regs before the BPF programs run
 * @size:     size passed on to perf_tp_event()
 * @rctx:     recursion context released on the early-return path, otherwise
 *            passed on to perf_tp_event()
 * @call:     trace_event_call whose program array is executed
 * @count:    passed on to perf_tp_event()
 * @regs:     register snapshot
 * @head:     list of perf events; an empty list skips the perf submission
 * @task:     passed on to perf_tp_event()
 */
void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
struct trace_event_call *call, u64 count,
struct pt_regs *regs, struct hlist_head *head,
struct task_struct *task)
{
if (bpf_prog_array_valid(call)) {
/* Store the regs pointer at the start of the buffer handed to the BPF programs. */
*(struct pt_regs **)raw_data = regs;
/* A zero result from the programs, or no perf listeners, drops the event here. */
if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) {
perf_swevent_put_recursion_context(rctx);
return;
}
}
/* Not filtered out: hand the sample to perf. */
perf_tp_event(call->event.type, count, raw_data, size, regs, head,
rctx, task);
}
//include/linux/bpf.h
/*
 * __BPF_PROG_RUN_ARRAY - run every program in a bpf_prog_array.
 * @array:          RCU-protected pointer to the program array
 * @ctx:            context pointer handed to each program
 * @func:           runner, invoked as func(prog, ctx)
 * @check_non_null: when true, a NULL array is tolerated and the result is 1
 * @set_cg_storage: when true, bpf_cgroup_storage_set() is called per item
 *
 * Evaluates to a u32 that starts at 1 and is AND-ed with the return value
 * of every program, so all programs always run. The walk stops at the first
 * item whose prog pointer is NULL.
 *
 * Every line but the last must end in a backslash, and the cursor must be
 * advanced with _item++ or the loop would re-run the first program forever.
 */
#define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null, set_cg_storage) \
	({								\
		struct bpf_prog_array_item *_item;			\
		struct bpf_prog *_prog;					\
		struct bpf_prog_array *_array;				\
		u32 _ret = 1;						\
		preempt_disable();					\
		rcu_read_lock();					\
		_array = rcu_dereference(array);			\
		if (unlikely(check_non_null && !_array))		\
			goto _out;					\
		_item = &_array->items[0];				\
		while ((_prog = READ_ONCE(_item->prog))) {		\
			if (set_cg_storage)				\
				bpf_cgroup_storage_set(_item->cgroup_storage); \
			_ret &= func(_prog, ctx);			\
			_item++; /* step to the next array slot */	\
		}							\
_out:									\
		rcu_read_unlock();					\
		preempt_enable();					\
		_ret;							\
	})
/*
 * BPF_PROG_RUN_ARRAY_CHECK - run a program array that may be NULL.
 * Forwards to __BPF_PROG_RUN_ARRAY() with check_non_null = true and
 * set_cg_storage = false. The trailing backslash is required so that the
 * second line is part of the macro body.
 */
#define BPF_PROG_RUN_ARRAY_CHECK(array, ctx, func)	\
	__BPF_PROG_RUN_ARRAY(array, ctx, func, true, false)
//kernel/trace/bpf_trace.c
/**
 * trace_call_bpf - invoke BPF program
 * @call: tracepoint event
 * @ctx: opaque context pointer
 *
 * kprobe handlers execute BPF programs via this helper.
 * Can be used from static tracepoints in the future.
 *
 * Return: BPF programs always return an integer which is interpreted by
 * kprobe handler as:
 * 0 - return from kprobe (event is filtered out)
 * 1 - store kprobe event into ring buffer
 * Other values are reserved and currently alias to 1
 *
 * This is an abridged excerpt: the "......" lines stand for code left out
 * of this walkthrough.
 */
unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
{
......
/*
 * Run every program in call->prog_array with ctx, using BPF_PROG_RUN as
 * the per-program runner; the macro tolerates a NULL array.
 */
ret = BPF_PROG_RUN_ARRAY_CHECK(call->prog_array, ctx, BPF_PROG_RUN);
......
}
trace_call_bpf(call, raw_data)的返回值取决于我们定义的ebpf内核态程序的函数返回值,对应到前面的示例
就是tp_kfree_skb函数的返回值。当tp_kfree_skb返回值为0时perf_tp_event将不会被执行。因此如果tp_kfree_skb
返回值为0时,当我们同时执行我们的ebpf程序trace_kfree_skb以及perf trace -e skb:kfree_skb时,
这时候perf trace -e skb:kfree_skb将无法获取到消息输出。
阅读代码过程参考了如下资料:
https://github.com/libbpf/libbpf-bootstrap
https://blog.csdn.net/qq_17045267/article/details/125642103
https://terenceli.github.io/技术/2020/08/09/ebpf-with-tracepoint
https://richardweiyang-2.gitbook.io/kernel-exploring/00-index-3/02-trace_event
https://www.ebpf.top/categories/BPF-CORE/